diff --git a/.github/workflows/manifold.yml b/.github/workflows/manifold.yml index 2e7f069b4..953f1fd17 100644 --- a/.github/workflows/manifold.yml +++ b/.github/workflows/manifold.yml @@ -14,7 +14,8 @@ jobs: build: strategy: matrix: - backend: [CPP, OMP, TBB, CUDA] + cuda_support: [ON, OFF] + parallel_backend: [NONE, OMP, TBB] runs-on: ubuntu-20.04 container: image: docker://nvidia/cuda:11.6.0-devel-ubuntu20.04 @@ -33,9 +34,10 @@ jobs: patch third_party/assimp/code/CMakeLists.txt < assimp.diff mkdir build cd build - cmake -DCMAKE_BUILD_TYPE=Release -DTHRUST_BACKEND=${{matrix.backend}} .. && make - - name: Test ${{matrix.backend}} - if: matrix.backend != 'CUDA' + cmake -DCMAKE_BUILD_TYPE=Release -DMANIFOLD_PAR=${{matrix.parallel_backend}} -DMANIFOLD_USE_CUDA=${{matrix.cuda_support}} .. && make + - name: Test ${{matrix.parallel_backend}} with CUDA ${{matrix.cuda_support}} + # note that the test for CUDA backend does not really test CUDA, as we + # don't have CUDA GPU on GitHub Action run: | cd build/test ./manifold_test @@ -74,8 +76,8 @@ jobs: build_windows: strategy: matrix: - # backend: [CUDA, CPP] - backend: [CPP] + parallel_backend: [NONE] + cuda_support: [OFF] max-parallel: 1 runs-on: windows-2019 steps: @@ -107,11 +109,10 @@ jobs: - name: Build ${{matrix.backend}} shell: powershell run: | - cmake . -DCMAKE_BUILD_TYPE=Release -B build -DTHRUST_BACKEND=${{matrix.backend}} -A x64 + cmake . -DCMAKE_BUILD_TYPE=Release -B build -DMANIFOLD_PAR=${{matrix.parallel_backend}} -DMANIFOLD_USE_CUDA=${{matrix.cuda_support}} -A x64 cd build cmake --build . 
--target ALL_BUILD --config Release - - name: Test ${{matrix.backend}} - if: matrix.backend != 'CUDA' + - name: Test ${{matrix.parallel_backend}} with CUDA ${{matrix.cuda_support}} shell: bash run: | cd build/test @@ -120,7 +121,7 @@ jobs: build_nix: strategy: matrix: - variant: [cpp, omp, cuda, js, tbb] + variant: [none, omp, tbb, none-cuda, omp-cuda, tbb-cuda, js] runs-on: ubuntu-latest steps: - uses: actions/checkout@v2.4.0 diff --git a/CMakeLists.txt b/CMakeLists.txt index b3843c437..d016e6e94 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,13 +9,14 @@ set(CMAKE_VERBOSE_MAKEFILE ON) # ) # endif() -set(THRUST_BACKEND "CUDA" CACHE STRING - "Thrust backend, either \"CUDA\", \"OMP\" or \"CPP\"(single-threaded)" -) +option(MANIFOLD_USE_CUDA "Build with CUDA support" ON) +set(MANIFOLD_PAR "TBB" CACHE STRING "Parallel backend, either \"TBB\" or \"OpenMP\" or \"NONE\"") + if(EMSCRIPTEN) message("Building for Emscripten") set(CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS} -sALLOW_MEMORY_GROWTH=1) - set(THRUST_BACKEND "CPP") + set(MANIFOLD_USE_CUDA OFF) + set(MANIFOLD_PAR "NONE") endif() option(BUILD_TEST_CGAL off) @@ -26,8 +27,6 @@ set(ASSIMP_INC_DIR ${CMAKE_BINARY_DIR}/third_party/assimp/include) set(GLM_INC_DIR ${PROJECT_SOURCE_DIR}/third_party/glm) -set_property(CACHE THRUST_BACKEND PROPERTY STRINGS CUDA OMP CPP) - option(ASSIMP_FAST_BUILD "build ASSIMP just for tests" ON) if(ASSIMP_FAST_BUILD) option(ASSIMP_INSTALL FALSE) @@ -39,39 +38,49 @@ if(ASSIMP_FAST_BUILD) endforeach() endif() -if(MSVC) - set(MANIFOLD_FLAGS) -else() - set(MANIFOLD_FLAGS -Werror -Wall -Wno-sign-compare -Wno-unused) -endif() set(MANIFOLD_NVCC_FLAGS -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored --extended-lambda) set(MANIFOLD_NVCC_RELEASE_FLAGS -O3) set(MANIFOLD_NVCC_DEBUG_FLAGS -G) set(THRUST_INC_DIR ${PROJECT_SOURCE_DIR}/third_party/thrust) -message("Using ${THRUST_BACKEND} backend") -if (THRUST_BACKEND STREQUAL "CUDA") - # we cannot set THRUST_INC_DIR when building with CUDA, otherwise the - # 
compiler will not use the system CUDA headers which causes incompatibility - enable_language(CUDA) - # clear THRUST_INC_DIR, we use the one from nvcc - set(THRUST_INC_DIR "") -elseif (THRUST_BACKEND STREQUAL "OMP") +message("CUDA Support: ${MANIFOLD_USE_CUDA}") +message("Parallel Backend: ${MANIFOLD_PAR}") + +if (MANIFOLD_PAR STREQUAL "OMP") find_package(OpenMP REQUIRED) set(MANIFOLD_INCLUDE OpenMP::OpenMP_CXX) - set(MANIFOLD_FLAGS ${MANIFOLD_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_OMP -fopenmp) -elseif (THRUST_BACKEND STREQUAL "CPP") - set(MANIFOLD_FLAGS ${MANIFOLD_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CPP) -elseif (THRUST_BACKEND STREQUAL "TBB") + set(MANIFOLD_FLAGS ${MANIFOLD_FLAGS} -fopenmp -DMANIFOLD_PAR='O') +elseif (MANIFOLD_PAR STREQUAL "TBB") find_package(PkgConfig REQUIRED) pkg_check_modules(TBB REQUIRED tbb) - set(MANIFOLD_INCLUDE ${TBB_LINK_LIBRARIES}) - set(MANIFOLD_FLAGS ${MANIFOLD_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_TBB) + set(MANIFOLD_FLAGS ${MANIFOLD_FLAGS} -DMANIFOLD_PAR='T') +elseif (MANIFOLD_PAR STREQUAL "NONE") + set(MANIFOLD_FLAGS ${MANIFOLD_FLAGS}) + set(MANIFOLD_PAR "CPP") else () - message(FATAL_ERROR "Invalid value for THRUST_BACKEND: ${THRUST_BACKEND}. " - "Should be one of \"CUDA\", \"OMP\", \"CPP\" or \"TBB\"") + message(FATAL_ERROR "Invalid value for MANIFOLD_PAR: ${MANIFOLD_PAR}. 
" + "Should be one of \"TBB\", \"OMP\", \"NONE\"") +endif() + +if (MANIFOLD_USE_CUDA) + enable_language(CUDA) + find_package(CUDA REQUIRED) + # we cannot set THRUST_INC_DIR when building with CUDA, otherwise the + # compiler will not use the system CUDA headers which causes incompatibility + # clear THRUST_INC_DIR, we use the one from nvcc + set(THRUST_INC_DIR "") + set(MANIFOLD_FLAGS ${MANIFOLD_FLAGS} -DMANIFOLD_USE_CUDA) + set(MANIFOLD_NVCC_FLAGS ${MANIFOLD_NVCC_FLAGS} ${MANIFOLD_FLAGS}) +else () + set(MANIFOLD_FLAGS ${MANIFOLD_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_${MANIFOLD_PAR}) +endif() + +if(MSVC) + set(MANIFOLD_FLAGS ${MANIFOLD_FLAGS}) +else() + set(MANIFOLD_FLAGS ${MANIFOLD_FLAGS} -Werror -Wall -Wno-sign-compare -Wno-unused) endif() add_subdirectory(utilities) diff --git a/collider/CMakeLists.txt b/collider/CMakeLists.txt index d0799679b..c117c37ad 100644 --- a/collider/CMakeLists.txt +++ b/collider/CMakeLists.txt @@ -2,7 +2,7 @@ project (collider) add_library(${PROJECT_NAME} src/collider.cpp) -if(THRUST_BACKEND STREQUAL "CUDA") +if(MANIFOLD_USE_CUDA) set_source_files_properties(src/collider.cpp PROPERTIES LANGUAGE CUDA) set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_ARCHITECTURES 61) target_compile_options(${PROJECT_NAME} diff --git a/collider/src/collider.cpp b/collider/src/collider.cpp index f02227ad8..cfa987a45 100644 --- a/collider/src/collider.cpp +++ b/collider/src/collider.cpp @@ -14,6 +14,7 @@ #include "collider.h" #include "utils.h" +#include "par.h" #ifdef _MSC_VER #include @@ -252,7 +253,7 @@ Collider::Collider(const VecDH& leafBB, nodeParent_.resize(num_nodes, -1); internalChildren_.resize(leafBB.size() - 1, thrust::make_pair(-1, -1)); // organize tree - thrust::for_each_n(thrust::device, countAt(0), NumInternal(), + for_each_n(autoPolicy(NumInternal()), countAt(0), NumInternal(), CreateRadixTree({nodeParent_.ptrD(), internalChildren_.ptrD(), leafMorton})); UpdateBoxes(leafBB); @@ -273,8 +274,8 @@ SparseIndices 
Collider::Collisions(const VecDH& querriesIn) const { // scalar number of overlaps found VecDH nOverlapsD(1, 0); // calculate Bounding Box overlaps - thrust::for_each_n( - thrust::device, zip(querriesIn.cbegin(), countAt(0)), querriesIn.size(), + for_each_n( + autoPolicy(querriesIn.size()), zip(querriesIn.cbegin(), countAt(0)), querriesIn.size(), FindCollisions({querryTri.ptrDpq(), nOverlapsD.ptrD(), maxOverlaps, nodeBBox_.ptrD(), internalChildren_.ptrD()})); nOverlaps = nOverlapsD[0]; @@ -305,13 +306,13 @@ void Collider::UpdateBoxes(const VecDH& leafBB) { // copy in leaf node Boxs strided_range::Iter> leaves(nodeBBox_.begin(), nodeBBox_.end(), 2); - thrust::copy(thrust::device, leafBB.cbegin(), leafBB.cend(), leaves.begin()); + auto policy = autoPolicy(NumInternal()); + copy(policy, leafBB.cbegin(), leafBB.cend(), leaves.begin()); // create global counters - VecDH counter(NumInternal()); - thrust::fill(thrust::device, counter.begin(), counter.end(), 0); + VecDH counter(NumInternal(), 0); // kernel over leaves to save internal Boxs - thrust::for_each_n( - thrust::device, countAt(0), NumLeaves(), + for_each_n( + policy, countAt(0), NumLeaves(), BuildInternalBoxes({nodeBBox_.ptrD(), counter.ptrD(), nodeParent_.ptrD(), internalChildren_.ptrD()})); } @@ -330,7 +331,7 @@ bool Collider::Transform(glm::mat4x3 transform) { if (count != 2) axisAligned = false; } if (axisAligned) { - thrust::for_each(thrust::device, nodeBBox_.begin(), nodeBBox_.end(), + for_each(autoPolicy(nodeBBox_.size()), nodeBBox_.begin(), nodeBBox_.end(), TransformBox({transform})); } return axisAligned; diff --git a/flake.nix b/flake.nix index 8d917a196..08578a91a 100644 --- a/flake.nix +++ b/flake.nix @@ -10,33 +10,65 @@ inherit system; config.allowUnfree = true; }; - manifold = { backend ? "CPP", doCheck ? true, build-tools ? [ ], runtime ? 
[ ] }: pkgs.stdenv.mkDerivation { - inherit doCheck; - pname = "manifold-${backend}"; - version = "beta"; - src = self; - patches = [ ./assimp.diff ]; - nativeBuildInputs = (with pkgs; [ cmake python38 ]) ++ build-tools; - buildInputs = runtime; - cmakeFlags = [ "-DTHRUST_BACKEND=${backend}" ]; - checkPhase = '' - cd test - ./manifold_test - cd ../ - ''; - installPhase = '' - mkdir -p $out - cp manifold/libmanifold.a $out/ - cp meshIO/libmeshIO.a $out/ - cp tools/loadMesh $out - cp tools/perfTest $out - ''; - }; + manifold = + { parallel-backend ? "none" + , cuda-support ? false + , doCheck ? true + , build-tools ? [ ] + , ... + }: pkgs.stdenv.mkDerivation { + inherit doCheck; + pname = + if cuda-support then + "manifold-${parallel-backend}-cuda" + else + "manifold-${parallel-backend}"; + version = "beta"; + src = self; + patches = [ ./assimp.diff ]; + nativeBuildInputs = (with pkgs; [ cmake python38 ]) ++ build-tools ++ + (if cuda-support then [ pkgs.cudatoolkit_11_5 pkgs.addOpenGLRunpath ] else [ ]); + cmakeFlags = [ + "-DMANIFOLD_PAR=${pkgs.lib.strings.toUpper parallel-backend}" + "-DMANIFOLD_USE_CUDA=${if cuda-support then "ON" else "OFF"}" + ]; + checkPhase = '' + cd test + ./manifold_test + cd ../ + ''; + installPhase = '' + mkdir -p $out + cp manifold/libmanifold.a $out/ + cp meshIO/libmeshIO.a $out/ + cp tools/loadMesh $out + cp tools/perfTest $out + ''; + }; + parallelBackends = [ + { parallel-backend = "none"; } + { + parallel-backend = "omp"; + build-tools = [ pkgs.llvmPackages_13.openmp ]; + } + { + parallel-backend = "tbb"; + build-tools = with pkgs; [ tbb pkg-config ]; + } + ]; + buildMatrix = with pkgs; with lib; lists.flatten (map + (env: map + (x: x // env) + parallelBackends) [ + { cuda-support = false; } + { + cuda-support = true; + } + ]); devShell = { additional ? 
[ ] }: pkgs.mkShell { buildInputs = with pkgs; [ cmake - ccls - llvmPackages.openmp + llvmPackages_13.openmp clang-tools clang_13 emscripten @@ -45,48 +77,45 @@ }; in { - packages.manifold-cpp = manifold { }; - packages.manifold-omp = manifold { backend = "OMP"; runtime = [ pkgs.llvmPackages.openmp ]; }; - packages.manifold-tbb = manifold { backend = "TBB"; runtime = [ pkgs.tbb pkgs.pkg-config ]; }; - packages.manifold-cuda = manifold { - backend = "CUDA"; - runtime = [ - pkgs.cudaPackages.cudatoolkit_11 - ]; - doCheck = false; - }; - packages.manifold-js = pkgs.buildEmscriptenPackage { - name = "manifold-js"; - version = "beta"; - src = self; - patches = [ ./assimp.diff ]; - nativeBuildInputs = (with pkgs; [ cmake python38 ]); - buildInputs = [ pkgs.nodejs ]; - configurePhase = '' - mkdir build - cd build - emcmake cmake -DCMAKE_BUILD_TYPE=Release .. - ''; - buildPhase = '' - emmake make - ''; - checkPhase = '' - cd test - node manifold_test.js - cd ../ - ''; - installPhase = '' - mkdir -p $out - cd tools - cp *.js $out/ - cp *.wasm $out/ - ''; + packages = (builtins.listToAttrs + (map + (x: { + name = "manifold-" + x.parallel-backend + (if + x.cuda-support then "-cuda" else ""); + value = manifold x; + }) + buildMatrix)) // { + manifold-js = pkgs.buildEmscriptenPackage { + name = "manifold-js"; + version = "beta"; + src = self; + patches = [ ./assimp.diff ]; + nativeBuildInputs = (with pkgs; [ cmake python38 ]); + buildInputs = [ pkgs.nodejs ]; + configurePhase = '' + mkdir build + cd build + emcmake cmake -DCMAKE_BUILD_TYPE=Release .. 
+ ''; + buildPhase = '' + emmake make + ''; + checkPhase = '' + cd test + node manifold_test.js + cd ../ + ''; + installPhase = '' + mkdir -p $out + cd tools + cp *.js $out/ + cp *.wasm $out/ + ''; + }; }; devShell = devShell { }; devShells.cuda = devShell { - additional = [ - pkgs.cudaPackages.cudatoolkit_11 - ]; + additional = [ pkgs.cudatoolkit_11_5 ]; }; } ); diff --git a/manifold/CMakeLists.txt b/manifold/CMakeLists.txt index 7cc4cb4b4..482166f1e 100644 --- a/manifold/CMakeLists.txt +++ b/manifold/CMakeLists.txt @@ -5,7 +5,7 @@ add_library(${PROJECT_NAME} src/manifold.cpp src/constructors.cpp src/impl.cpp src/smoothing.cpp src/boolean3.cpp src/boolean_result.cpp ) -if(THRUST_BACKEND STREQUAL "CUDA") +if(MANIFOLD_USE_CUDA) set_source_files_properties( src/manifold.cpp src/constructors.cpp src/impl.cpp src/properties.cpp src/sort.cpp src/edge_op.cpp src/face_op.cpp diff --git a/manifold/src/boolean3.cpp b/manifold/src/boolean3.cpp index 5bd239332..0bfa89d61 100644 --- a/manifold/src/boolean3.cpp +++ b/manifold/src/boolean3.cpp @@ -13,6 +13,7 @@ // limitations under the License. 
#include "boolean3.h" +#include "par.h" #include // TODO: make this runtime configurable for quicker debug @@ -85,12 +86,13 @@ struct CopyFaceEdges { SparseIndices Filter11(const Manifold::Impl &inP, const Manifold::Impl &inQ, const SparseIndices &p1q2, const SparseIndices &p2q1) { SparseIndices p1q1(3 * p1q2.size() + 3 * p2q1.size()); - thrust::for_each_n(thrust::device, zip(countAt(0), p1q2.begin(0), p1q2.begin(1)), + auto policy = autoPolicy(p1q2.size()); + for_each_n(policy, zip(countAt(0), p1q2.begin(0), p1q2.begin(1)), p1q2.size(), CopyFaceEdges({p1q1.ptrDpq(), inQ.halfedge_.cptrD()})); p1q1.SwapPQ(); - thrust::for_each_n(thrust::device, zip(countAt(p1q2.size()), p2q1.begin(1), p2q1.begin(0)), + for_each_n(policy, zip(countAt(p1q2.size()), p2q1.begin(1), p2q1.begin(0)), p2q1.size(), CopyFaceEdges({p1q1.ptrDpq(), inP.halfedge_.cptrD()})); p1q1.SwapPQ(); @@ -245,8 +247,8 @@ std::tuple, VecDH> Shadow11(SparseIndices &p1q1, VecDH s11(p1q1.size()); VecDH xyzz11(p1q1.size()); - thrust::for_each_n( - thrust::device, zip(xyzz11.begin(), s11.begin(), p1q1.begin(0), p1q1.begin(1)), + for_each_n( + autoPolicy(p1q1.size()), zip(xyzz11.begin(), s11.begin(), p1q1.begin(0), p1q1.begin(1)), p1q1.size(), Kernel11({inP.vertPos_.cptrD(), inQ.vertPos_.cptrD(), inP.halfedge_.cptrD(), inQ.halfedge_.cptrD(), expandP, @@ -342,8 +344,8 @@ std::tuple, VecDH> Shadow02(const Manifold::Impl &inP, auto vertNormalP = forward ? 
inP.vertNormal_.cptrD() : inQ.vertNormal_.cptrD(); - thrust::for_each_n( - thrust::device, zip(s02.begin(), z02.begin(), p0q2.begin(!forward), + for_each_n( + autoPolicy(p0q2.size()), zip(s02.begin(), z02.begin(), p0q2.begin(!forward), p0q2.begin(forward)), p0q2.size(), Kernel02({inP.vertPos_.cptrD(), inQ.halfedge_.cptrD(), @@ -452,8 +454,8 @@ std::tuple, VecDH> Intersect12( VecDH x12(p1q2.size()); VecDH v12(p1q2.size()); - thrust::for_each_n( - thrust::device, zip(x12.begin(), v12.begin(), p1q2.begin(!forward), + for_each_n( + autoPolicy(p1q2.size()), zip(x12.begin(), v12.begin(), p1q2.begin(!forward), p1q2.begin(forward)), p1q2.size(), Kernel12({p0q2.ptrDpq(), s02.ptrD(), z02.cptrD(), p0q2.size(), @@ -471,19 +473,21 @@ VecDH Winding03(const Manifold::Impl &inP, SparseIndices &p0q2, // verts that are not shadowed (not in p0q2) have winding number zero. VecDH w03(inP.NumVert(), 0); - if (!thrust::is_sorted(thrust::device, p0q2.begin(reverse), p0q2.end(reverse))) - thrust::sort_by_key(thrust::device, p0q2.begin(reverse), p0q2.end(reverse), s02.begin()); + auto policy = autoPolicy(p0q2.size()); + if (!is_sorted(policy, p0q2.begin(reverse), p0q2.end(reverse))) + sort_by_key(policy, p0q2.begin(reverse), p0q2.end(reverse), s02.begin()); VecDH w03val(w03.size()); VecDH w03vert(w03.size()); // sum known s02 values into w03 (winding number) auto endPair = - thrust::reduce_by_key(thrust::device, p0q2.begin(reverse), p0q2.end(reverse), + reduce_by_key> + (policy, p0q2.begin(reverse), p0q2.end(reverse), s02.begin(), w03vert.begin(), w03val.begin()); - thrust::scatter(thrust::device, w03val.begin(), endPair.second, w03vert.begin(), + scatter(autoPolicy(endPair.second - w03val.begin()), w03val.begin(), endPair.second, w03vert.begin(), w03.begin()); if (reverse) - thrust::transform(thrust::device, w03.begin(), w03.end(), w03.begin(), + transform(autoPolicy(w03.size()), w03.begin(), w03.end(), w03.begin(), thrust::negate()); return w03; }; diff --git 
a/manifold/src/boolean_result.cpp b/manifold/src/boolean_result.cpp index 5c072d430..50ac45e00 100644 --- a/manifold/src/boolean_result.cpp +++ b/manifold/src/boolean_result.cpp @@ -17,6 +17,7 @@ #include "boolean3.h" #include "polygon.h" +#include "par.h" // TODO: make this runtime configurable for quicker debug constexpr bool kVerbose = false; @@ -83,31 +84,33 @@ std::tuple, VecDH> SizeOutput( const VecDH &i03, const VecDH &i30, const VecDH &i12, const VecDH &i21, const SparseIndices &p1q2, const SparseIndices &p2q1, bool invertQ) { - VecDH sidesPerFacePQ(inP.NumTri() + inQ.NumTri()); + VecDH sidesPerFacePQ(inP.NumTri() + inQ.NumTri(), 0); auto sidesPerFaceP = sidesPerFacePQ.ptrD(); auto sidesPerFaceQ = sidesPerFacePQ.ptrD() + inP.NumTri(); - thrust::for_each(thrust::device, inP.halfedge_.begin(), inP.halfedge_.end(), + auto policy = autoPolicy(std::max(inP.halfedge_.size(), inQ.halfedge_.size())); + + for_each(policy, inP.halfedge_.begin(), inP.halfedge_.end(), CountVerts({sidesPerFaceP, i03.cptrD()})); - thrust::for_each(thrust::device, inQ.halfedge_.begin(), inQ.halfedge_.end(), + for_each(policy, inQ.halfedge_.begin(), inQ.halfedge_.end(), CountVerts({sidesPerFaceQ, i30.cptrD()})); - thrust::for_each_n( - thrust::device, zip(p1q2.begin(0), p1q2.begin(1), i12.begin()), i12.size(), + for_each_n( + policy, zip(p1q2.begin(0), p1q2.begin(1), i12.begin()), i12.size(), CountNewVerts({sidesPerFaceP, sidesPerFaceQ, inP.halfedge_.cptrD()})); - thrust::for_each_n( - thrust::device, zip(p2q1.begin(1), p2q1.begin(0), i21.begin()), i21.size(), + for_each_n( + policy, zip(p2q1.begin(1), p2q1.begin(0), i21.begin()), i21.size(), CountNewVerts({sidesPerFaceQ, sidesPerFaceP, inQ.halfedge_.cptrD()})); - VecDH facePQ2R(inP.NumTri() + inQ.NumTri() + 1); + VecDH facePQ2R(inP.NumTri() + inQ.NumTri() + 1, 0); auto keepFace = thrust::make_transform_iterator(sidesPerFacePQ.begin(), NotZero()); - thrust::inclusive_scan(thrust::device, keepFace, keepFace + sidesPerFacePQ.size(), + 
inclusive_scan(policy, keepFace, keepFace + sidesPerFacePQ.size(), facePQ2R.begin() + 1); int numFaceR = facePQ2R.back(); facePQ2R.resize(inP.NumTri() + inQ.NumTri()); outR.faceNormal_.resize(numFaceR); - auto next = thrust::copy_if(thrust::device, inP.faceNormal_.begin(), inP.faceNormal_.end(), + auto next = copy_if(policy, inP.faceNormal_.begin(), inP.faceNormal_.end(), keepFace, outR.faceNormal_.begin(), thrust::identity()); if (invertQ) { @@ -115,17 +118,17 @@ std::tuple, VecDH> SizeOutput( thrust::negate()); auto end = thrust::make_transform_iterator(inQ.faceNormal_.end(), thrust::negate()); - thrust::copy_if(thrust::device, start, end, keepFace + inP.NumTri(), next, + copy_if(policy, start, end, keepFace + inP.NumTri(), next, thrust::identity()); } else { - thrust::copy_if(thrust::device, inQ.faceNormal_.begin(), inQ.faceNormal_.end(), + copy_if(policy, inQ.faceNormal_.begin(), inQ.faceNormal_.end(), keepFace + inP.NumTri(), next, thrust::identity()); } auto newEnd = - thrust::remove(thrust::device, sidesPerFacePQ.begin(), sidesPerFacePQ.end(), 0); - VecDH faceEdge(newEnd - sidesPerFacePQ.begin() + 1); - thrust::inclusive_scan(thrust::device, sidesPerFacePQ.begin(), newEnd, + remove(policy, sidesPerFacePQ.begin(), sidesPerFacePQ.end(), 0); + VecDH faceEdge(newEnd - sidesPerFacePQ.begin() + 1, 0); + inclusive_scan(policy, sidesPerFacePQ.begin(), newEnd, faceEdge.begin() + 1); outR.halfedge_.resize(faceEdge.back()); @@ -409,8 +412,8 @@ void AppendWholeEdges(Manifold::Impl &outR, VecDH &facePtrR, const VecDH wholeHalfedgeP, const VecDH &i03, const VecDH &vP2R, const int *faceP2R, bool forward) { - thrust::for_each_n( - thrust::device, zip(wholeHalfedgeP.begin(), inP.halfedge_.begin(), countAt(0)), + for_each_n( + autoPolicy(inP.halfedge_.size()), zip(wholeHalfedgeP.begin(), inP.halfedge_.begin(), countAt(0)), inP.halfedge_.size(), DuplicateHalfedges({outR.halfedge_.ptrD(), halfedgeRef.ptrD(), facePtrR.ptrD(), inP.halfedge_.cptrD(), i03.cptrD(), @@ -489,8 +492,8 
@@ std::pair, VecDH> CalculateMeshRelation( VecDH faceRef(numFaceR); VecDH halfedgeBary(halfedgeRef.size()); VecDH idx(1, 0); - thrust::for_each_n( - thrust::device, zip(halfedgeBary.begin(), halfedgeRef.begin(), + for_each_n( + autoPolicy(halfedgeRef.size()), zip(halfedgeBary.begin(), halfedgeRef.begin(), outR.halfedge_.cbegin()), halfedgeRef.size(), CreateBarycentric( @@ -558,25 +561,27 @@ Manifold::Impl Boolean3::Result(Manifold::OpType op) const { VecDH i21(x21_.size()); VecDH i03(w03_.size()); VecDH i30(w30_.size()); - thrust::transform(thrust::device, x12_.begin(), x12_.end(), i12.begin(), c3 * _1); - thrust::transform(thrust::device, x21_.begin(), x21_.end(), i21.begin(), c3 * _1); - thrust::transform(thrust::device, w03_.begin(), w03_.end(), i03.begin(), c1 + c3 * _1); - thrust::transform(thrust::device, w30_.begin(), w30_.end(), i30.begin(), c2 + c3 * _1); + auto policy = autoPolicy(std::max(std::max(x12_.size(), x21_.size()), std::max(w03_.size(), w30_.size()))); + + transform(policy, x12_.begin(), x12_.end(), i12.begin(), c3 * _1); + transform(policy, x21_.begin(), x21_.end(), i21.begin(), c3 * _1); + transform(policy, w03_.begin(), w03_.end(), i03.begin(), c1 + c3 * _1); + transform(policy, w30_.begin(), w30_.end(), i30.begin(), c2 + c3 * _1); VecDH vP2R(inP_.NumVert()); - thrust::exclusive_scan(thrust::device, i03.begin(), i03.end(), vP2R.begin(), 0, AbsSum()); + exclusive_scan(policy, i03.begin(), i03.end(), vP2R.begin(), 0, AbsSum()); int numVertR = AbsSum()(vP2R.back(), i03.back()); const int nPv = numVertR; VecDH vQ2R(inQ_.NumVert()); - thrust::exclusive_scan(thrust::device, i30.begin(), i30.end(), vQ2R.begin(), numVertR, + exclusive_scan(policy, i30.begin(), i30.end(), vQ2R.begin(), numVertR, AbsSum()); numVertR = AbsSum()(vQ2R.back(), i30.back()); const int nQv = numVertR - nPv; VecDH v12R(v12_.size()); if (v12_.size() > 0) { - thrust::exclusive_scan(thrust::device, i12.begin(), i12.end(), v12R.begin(), numVertR, + exclusive_scan(policy, 
i12.begin(), i12.end(), v12R.begin(), numVertR, AbsSum()); numVertR = AbsSum()(v12R.back(), i12.back()); } @@ -584,7 +589,7 @@ Manifold::Impl Boolean3::Result(Manifold::OpType op) const { VecDH v21R(v21_.size()); if (v21_.size() > 0) { - thrust::exclusive_scan(thrust::device, i21.begin(), i21.end(), v21R.begin(), numVertR, + exclusive_scan(policy, i21.begin(), i21.end(), v21R.begin(), numVertR, AbsSum()); numVertR = AbsSum()(v21R.back(), i21.back()); } @@ -600,14 +605,14 @@ Manifold::Impl Boolean3::Result(Manifold::OpType op) const { outR.vertPos_.resize(numVertR); // Add vertices, duplicating for inclusion numbers not in [-1, 1]. // Retained vertices from P and Q: - thrust::for_each_n(thrust::device, zip(i03.begin(), vP2R.begin(), inP_.vertPos_.begin()), + for_each_n(policy, zip(i03.begin(), vP2R.begin(), inP_.vertPos_.begin()), inP_.NumVert(), DuplicateVerts({outR.vertPos_.ptrD()})); - thrust::for_each_n(thrust::device, zip(i30.begin(), vQ2R.begin(), inQ_.vertPos_.begin()), + for_each_n(policy, zip(i30.begin(), vQ2R.begin(), inQ_.vertPos_.begin()), inQ_.NumVert(), DuplicateVerts({outR.vertPos_.ptrD()})); // New vertices created from intersections: - thrust::for_each_n(thrust::device, zip(i12.begin(), v12R.begin(), v12_.begin()), + for_each_n(policy, zip(i12.begin(), v12R.begin(), v12_.begin()), i12.size(), DuplicateVerts({outR.vertPos_.ptrD()})); - thrust::for_each_n(thrust::device, zip(i21.begin(), v21R.begin(), v21_.begin()), + for_each_n(policy, zip(i21.begin(), v21R.begin(), v21_.begin()), i21.size(), DuplicateVerts({outR.vertPos_.ptrD()})); if (kVerbose) { diff --git a/manifold/src/constructors.cpp b/manifold/src/constructors.cpp index 743f534d0..113b3ec3d 100644 --- a/manifold/src/constructors.cpp +++ b/manifold/src/constructors.cpp @@ -17,6 +17,7 @@ #include "graph.h" #include "impl.h" #include "polygon.h" +#include "par.h" namespace { using namespace manifold; @@ -183,7 +184,7 @@ Manifold Manifold::Sphere(float radius, int circularSegments) { Manifold 
sphere; sphere.pImpl_ = std::make_unique(Impl::Shape::OCTAHEDRON); sphere.pImpl_->Subdivide(n); - thrust::for_each_n(thrust::device, sphere.pImpl_->vertPos_.begin(), sphere.NumVert(), + for_each_n(autoPolicy(sphere.NumVert()), sphere.pImpl_->vertPos_.begin(), sphere.NumVert(), ToSphere({radius})); sphere.pImpl_->Finish(); // Ignore preceding octahedron. @@ -394,6 +395,7 @@ Manifold Manifold::Compose(const std::vector& manifolds) { combined.halfedgeTangent_.resize(2 * numEdge); combined.meshRelation_.barycentric.resize(numBary); combined.meshRelation_.triBary.resize(numTri); + auto policy = autoPolicy(numTri); int nextVert = 0; int nextEdge = 0; @@ -403,20 +405,20 @@ Manifold Manifold::Compose(const std::vector& manifolds) { const Impl& impl = *(manifold.pImpl_); impl.ApplyTransform(); - thrust::copy(thrust::device, impl.vertPos_.begin(), impl.vertPos_.end(), + copy(policy, impl.vertPos_.begin(), impl.vertPos_.end(), combined.vertPos_.begin() + nextVert); - thrust::copy(thrust::device, impl.faceNormal_.begin(), impl.faceNormal_.end(), + copy(policy, impl.faceNormal_.begin(), impl.faceNormal_.end(), combined.faceNormal_.begin() + nextTri); - thrust::copy(thrust::device, impl.halfedgeTangent_.begin(), impl.halfedgeTangent_.end(), + copy(policy, impl.halfedgeTangent_.begin(), impl.halfedgeTangent_.end(), combined.halfedgeTangent_.begin() + nextEdge); - thrust::copy(thrust::device, impl.meshRelation_.barycentric.begin(), + copy(policy, impl.meshRelation_.barycentric.begin(), impl.meshRelation_.barycentric.end(), combined.meshRelation_.barycentric.begin() + nextBary); - thrust::transform(thrust::device, impl.meshRelation_.triBary.begin(), + transform(policy, impl.meshRelation_.triBary.begin(), impl.meshRelation_.triBary.end(), combined.meshRelation_.triBary.begin() + nextTri, UpdateTriBary({nextBary})); - thrust::transform(thrust::device, impl.halfedge_.begin(), impl.halfedge_.end(), + transform(policy, impl.halfedge_.begin(), impl.halfedge_.end(), 
combined.halfedge_.begin() + nextEdge, UpdateHalfedge({nextVert, nextEdge, nextTri})); @@ -477,22 +479,24 @@ std::vector Manifold::Decompose() const { for (int i = 0; i < numLabel; ++i) { meshes[i].pImpl_->vertPos_.resize(NumVert()); VecDH vertNew2Old(NumVert()); + auto policy = autoPolicy(NumVert()); + auto start = zip(meshes[i].pImpl_->vertPos_.begin(), vertNew2Old.begin()); int nVert = - thrust::copy_if( - thrust::device, zip(pImpl_->vertPos_.begin(), countAt(0)), + copy_if( + policy, zip(pImpl_->vertPos_.begin(), countAt(0)), zip(pImpl_->vertPos_.end(), countAt(NumVert())), vertLabel.begin(), zip(meshes[i].pImpl_->vertPos_.begin(), vertNew2Old.begin()), Equals({i})) - - zip(meshes[i].pImpl_->vertPos_.begin(), countAt(0)); + start; meshes[i].pImpl_->vertPos_.resize(nVert); VecDH faceNew2Old(NumTri()); - thrust::sequence(thrust::device, faceNew2Old.begin(), faceNew2Old.end()); + sequence(policy, faceNew2Old.begin(), faceNew2Old.end()); int nFace = - thrust::remove_if( - thrust::device, faceNew2Old.begin(), faceNew2Old.end(), + remove_if( + policy, faceNew2Old.begin(), faceNew2Old.end(), RemoveFace({pImpl_->halfedge_.cptrD(), vertLabel.cptrD(), i})) - faceNew2Old.begin(); faceNew2Old.resize(nFace); diff --git a/manifold/src/edge_op.cpp b/manifold/src/edge_op.cpp index eecf01568..566985318 100644 --- a/manifold/src/edge_op.cpp +++ b/manifold/src/edge_op.cpp @@ -13,6 +13,7 @@ // limitations under the License. 
#include "impl.h" +#include "par.h" namespace { using namespace manifold; @@ -119,9 +120,10 @@ namespace manifold { */ void Manifold::Impl::SimplifyTopology() { VecDH flaggedEdges(halfedge_.size()); + auto policy = autoPolicy(halfedge_.size()); int numFlagged = - thrust::copy_if( - thrust::device, countAt(0), countAt(halfedge_.size()), flaggedEdges.begin(), + copy_if( + policy, countAt(0), countAt(halfedge_.size()), flaggedEdges.begin(), ShortEdge({halfedge_.cptrD(), vertPos_.cptrD(), precision_})) - flaggedEdges.begin(); flaggedEdges.resize(numFlagged); @@ -130,8 +132,8 @@ void Manifold::Impl::SimplifyTopology() { flaggedEdges.resize(halfedge_.size()); numFlagged = - thrust::copy_if( - thrust::device, countAt(0), countAt(halfedge_.size()), flaggedEdges.begin(), + copy_if( + policy, countAt(0), countAt(halfedge_.size()), flaggedEdges.begin(), FlagEdge({halfedge_.cptrD(), meshRelation_.triBary.cptrD()})) - flaggedEdges.begin(); flaggedEdges.resize(numFlagged); @@ -139,8 +141,8 @@ void Manifold::Impl::SimplifyTopology() { for (const int edge : flaggedEdges) CollapseEdge(edge); flaggedEdges.resize(halfedge_.size()); - numFlagged = thrust::copy_if( - thrust::device, countAt(0), countAt(halfedge_.size()), flaggedEdges.begin(), + numFlagged = copy_if( + policy, countAt(0), countAt(halfedge_.size()), flaggedEdges.begin(), SwappableEdge({halfedge_.cptrD(), vertPos_.cptrD(), faceNormal_.cptrD(), precision_})) - flaggedEdges.begin(); diff --git a/manifold/src/impl.cpp b/manifold/src/impl.cpp index 3ffd4e11c..909716f92 100644 --- a/manifold/src/impl.cpp +++ b/manifold/src/impl.cpp @@ -20,6 +20,7 @@ #include "graph.h" #include "impl.h" +#include "par.h" namespace { using namespace manifold; @@ -357,7 +358,7 @@ Manifold::Impl::Impl(Shape shape) { void Manifold::Impl::ReinitializeReference(int meshID) { // instead of storing the meshID, we store 0 and set the mapping to // 0 -> meshID, because the meshID after boolean operation also starts from 0. 
- thrust::for_each_n(thrust::device, zip(meshRelation_.triBary.begin(), countAt(0)), NumTri(), + for_each_n(autoPolicy(NumTri()), zip(meshRelation_.triBary.begin(), countAt(0)), NumTri(), InitializeBaryRef({0, halfedge_.cptrD()})); meshRelation_.originalID.clear(); meshRelation_.originalID[0] = meshID; @@ -386,7 +387,7 @@ int Manifold::Impl::InitializeNewReference( "propertyTolerance."); const int numSets = properties.size() / numProps; - ALWAYS_ASSERT(thrust::all_of(thrust::device, triPropertiesD.begin(), triPropertiesD.end(), + ALWAYS_ASSERT(all_of(autoPolicy(triProperties.size()), triPropertiesD.begin(), triPropertiesD.end(), CheckProperties({numSets})), userErr, "triProperties value is outside the properties range."); @@ -394,8 +395,8 @@ int Manifold::Impl::InitializeNewReference( VecDH> face2face(halfedge_.size(), {-1, -1}); VecDH triArea(NumTri()); - thrust::for_each_n( - thrust::device, zip(face2face.begin(), countAt(0)), halfedge_.size(), + for_each_n( + autoPolicy(halfedge_.size()), zip(face2face.begin(), countAt(0)), halfedge_.size(), CoplanarEdge({triArea.ptrD(), halfedge_.cptrD(), vertPos_.cptrD(), triPropertiesD.cptrD(), propertiesD.cptrD(), propertyToleranceD.cptrD(), numProps, precision_})); @@ -477,10 +478,11 @@ void Manifold::Impl::CreateHalfedges(const VecDH& triVerts) { const int numTri = triVerts.size(); halfedge_.resize(3 * numTri); VecDH edge(3 * numTri); - thrust::for_each_n(thrust::device, zip(countAt(0), triVerts.begin()), numTri, + auto policy = autoPolicy(numTri); + for_each_n(policy, zip(countAt(0), triVerts.begin()), numTri, Tri2Halfedges({halfedge_.ptrD(), edge.ptrD()})); - thrust::sort(thrust::device, edge.begin(), edge.end()); - thrust::for_each_n(thrust::device, countAt(0), halfedge_.size() / 2, + sort(policy, edge.begin(), edge.end()); + for_each_n(policy, countAt(0), halfedge_.size() / 2, LinkHalfedges({halfedge_.ptrD(), edge.cptrD()})); } @@ -495,7 +497,8 @@ void Manifold::Impl::CreateAndFixHalfedges(const VecDH& triVerts) { 
halfedge_.resize(0); halfedge_.resize(3 * numTri); VecDH edge(3 * numTri); - thrust::for_each_n(thrust::device, zip(countAt(0), triVerts.begin()), numTri, + auto policy = autoPolicy(numTri); + for_each_n(policy, zip(countAt(0), triVerts.begin()), numTri, Tri2Halfedges({halfedge_.ptrD(), edge.ptrD()})); // Stable sort is required here so that halfedges from the same face are // paired together (the triangles were created in face order). In some @@ -503,7 +506,7 @@ void Manifold::Impl::CreateAndFixHalfedges(const VecDH& triVerts) { // two different faces, causing this edge to not be 2-manifold. We detect this // and fix it by swapping one of the identical edges, so it is important that // we have the edges paired according to their face. - thrust::stable_sort(thrust::device, edge.begin(), edge.end()); + stable_sort(policy, edge.begin(), edge.end()); thrust::for_each_n(thrust::host, countAt(0), halfedge_.size() / 2, LinkHalfedges({halfedge_.ptrH(), edge.cptrH()})); thrust::for_each(thrust::host, countAt(1), countAt(halfedge_.size() / 2), @@ -535,14 +538,15 @@ void Manifold::Impl::ApplyTransform() const { */ void Manifold::Impl::ApplyTransform() { if (transform_ == glm::mat4x3(1.0f)) return; - thrust::for_each(thrust::device, vertPos_.begin(), vertPos_.end(), + auto policy = autoPolicy(vertPos_.size()); + for_each(policy, vertPos_.begin(), vertPos_.end(), Transform4x3({transform_})); glm::mat3 normalTransform = glm::inverse(glm::transpose(glm::mat3(transform_))); - thrust::for_each(thrust::device, faceNormal_.begin(), faceNormal_.end(), + for_each(policy, faceNormal_.begin(), faceNormal_.end(), TransformNormals({normalTransform})); - thrust::for_each(thrust::device, vertNormal_.begin(), vertNormal_.end(), + for_each(policy, vertNormal_.begin(), vertNormal_.end(), TransformNormals({normalTransform})); // This optimization does a cheap collider update if the transform is // axis-aligned. 
@@ -585,17 +589,18 @@ void Manifold::Impl::SetPrecision(float minPrecision) { */ void Manifold::Impl::CalculateNormals() { vertNormal_.resize(NumVert()); - thrust::fill(thrust::device, vertNormal_.begin(), vertNormal_.end(), glm::vec3(0)); + auto policy = autoPolicy(NumTri()); + fill(policy, vertNormal_.begin(), vertNormal_.end(), glm::vec3(0)); bool calculateTriNormal = false; if (faceNormal_.size() != NumTri()) { faceNormal_.resize(NumTri()); calculateTriNormal = true; } - thrust::for_each_n( - thrust::device, zip(faceNormal_.begin(), countAt(0)), NumTri(), + for_each_n( + policy, zip(faceNormal_.begin(), countAt(0)), NumTri(), AssignNormals({vertNormal_.ptrD(), vertPos_.cptrD(), halfedge_.cptrD(), precision_, calculateTriNormal})); - thrust::for_each(thrust::device, vertNormal_.begin(), vertNormal_.end(), Normalize()); + for_each(policy, vertNormal_.begin(), vertNormal_.end(), Normalize()); } /** @@ -617,28 +622,27 @@ void Manifold::Impl::UpdateMeshIDs(VecDH &meshIDs, VecDH &originalIDs, int startTri, int n, int startID) { if (n == -1) n = meshRelation_.triBary.size(); - thrust::sort_by_key(thrust::host, meshIDs.begin(), meshIDs.end(), - originalIDs.begin()); + sort_by_key(autoPolicy(n), meshIDs.begin(), meshIDs.end(), originalIDs.begin()); constexpr int kOccurred = 1 << 30; VecDH error(1, -1); const int numMesh = meshIDs.size(); const int *meshIDsPtr = meshIDs.cptrD(); int *originalPtr = originalIDs.ptrD(); int *errorPtr = error.ptrD(); - thrust::for_each(thrust::device, - meshRelation_.triBary.begin() + startTri, - meshRelation_.triBary.begin() + startTri + n, - [=] __host__ __device__(BaryRef & b) { - int index = - thrust::lower_bound(meshIDsPtr, meshIDsPtr + numMesh, - b.meshID) - - meshIDsPtr; - if (index >= numMesh || meshIDsPtr[index] != b.meshID) { - *errorPtr = b.meshID; - } - b.meshID = index + startID; - originalPtr[index] |= kOccurred; - }); + for_each(autoPolicy(n), + meshRelation_.triBary.begin() + startTri, + meshRelation_.triBary.begin() + 
startTri + n, + [=] __host__ __device__(BaryRef & b) { + int index = + thrust::lower_bound(meshIDsPtr, meshIDsPtr + numMesh, + b.meshID) - + meshIDsPtr; + if (index >= numMesh || meshIDsPtr[index] != b.meshID) { + *errorPtr = b.meshID; + } + b.meshID = index + startID; + originalPtr[index] |= kOccurred; + }); if (error[0] != -1) { std::stringstream ss; @@ -663,12 +667,13 @@ SparseIndices Manifold::Impl::EdgeCollisions(const Impl& Q) const { VecDH edges = CreateTmpEdges(Q.halfedge_); const int numEdge = edges.size(); VecDH QedgeBB(numEdge); - thrust::for_each_n(thrust::device, zip(QedgeBB.begin(), edges.cbegin()), numEdge, + auto policy = autoPolicy(numEdge); + for_each_n(policy, zip(QedgeBB.begin(), edges.cbegin()), numEdge, EdgeBox({Q.vertPos_.cptrD()})); SparseIndices q1p2 = collider_.Collisions(QedgeBB); - thrust::for_each(thrust::device, q1p2.begin(0), q1p2.end(0), ReindexEdge({edges.cptrD()})); + for_each(policy, q1p2.begin(0), q1p2.end(0), ReindexEdge({edges.cptrD()})); return q1p2; } diff --git a/manifold/src/manifold.cpp b/manifold/src/manifold.cpp index 04c02eeed..8e0a2edee 100644 --- a/manifold/src/manifold.cpp +++ b/manifold/src/manifold.cpp @@ -14,6 +14,7 @@ #include "boolean3.h" #include "impl.h" +#include "par.h" namespace { using namespace manifold; @@ -121,6 +122,7 @@ Mesh Manifold::GetMesh() const { pImpl_->halfedgeTangent_.end()); result.triVerts.resize(NumTri()); + // note that `triVerts` is `std::vector`, so we cannot use thrust::device thrust::for_each_n(thrust::host, zip(result.triVerts.begin(), countAt(0)), NumTri(), MakeTri({pImpl_->halfedge_.cptrH()})); diff --git a/manifold/src/properties.cpp b/manifold/src/properties.cpp index 3862fec2e..29b30e07a 100644 --- a/manifold/src/properties.cpp +++ b/manifold/src/properties.cpp @@ -18,6 +18,7 @@ #include #include "impl.h" +#include "par.h" namespace { using namespace manifold; @@ -214,12 +215,13 @@ namespace manifold { */ bool Manifold::Impl::IsManifold() const { if (halfedge_.size() == 0) 
return true; - bool isManifold = thrust::all_of(thrust::device, countAt(0), countAt(halfedge_.size()), + auto policy = autoPolicy(halfedge_.size()); + bool isManifold = all_of(policy, countAt(0), countAt(halfedge_.size()), CheckManifold({halfedge_.cptrD()})); VecDH halfedge(halfedge_); - thrust::sort(thrust::device, halfedge.begin(), halfedge.end()); - isManifold &= thrust::all_of(thrust::device, countAt(0), countAt(2 * NumEdge() - 1), + sort(policy, halfedge.begin(), halfedge.end()); + isManifold &= all_of(policy, countAt(0), countAt(2 * NumEdge() - 1), NoDuplicates({halfedge.cptrD()})); return isManifold; } @@ -229,7 +231,7 @@ bool Manifold::Impl::IsManifold() const { */ bool Manifold::Impl::MatchesTriNormals() const { if (halfedge_.size() == 0 || faceNormal_.size() != NumTri()) return true; - return thrust::all_of(thrust::device, countAt(0), countAt(NumTri()), + return all_of(autoPolicy(NumTri()), countAt(0), countAt(NumTri()), CheckCCW({halfedge_.cptrD(), vertPos_.cptrD(), faceNormal_.cptrD(), 2 * precision_})); } @@ -239,7 +241,7 @@ bool Manifold::Impl::MatchesTriNormals() const { */ int Manifold::Impl::NumDegenerateTris() const { if (halfedge_.size() == 0 || faceNormal_.size() != NumTri()) return true; - return thrust::count_if(thrust::device, countAt(0), countAt(NumTri()), + return count_if(autoPolicy(NumTri()), countAt(0), countAt(NumTri()), CheckCCW({halfedge_.cptrD(), vertPos_.cptrD(), faceNormal_.cptrD(), -1 * precision_ / 2})); } @@ -247,8 +249,8 @@ int Manifold::Impl::NumDegenerateTris() const { Properties Manifold::Impl::GetProperties() const { if (IsEmpty()) return {0, 0}; ApplyTransform(); - thrust::pair areaVolume = thrust::transform_reduce( - thrust::device, countAt(0), countAt(NumTri()), + auto areaVolume = transform_reduce>( + autoPolicy(NumTri()), countAt(0), countAt(NumTri()), FaceAreaVolume({halfedge_.cptrD(), vertPos_.cptrD(), precision_}), thrust::make_pair(0.0f, 0.0f), SumPair()); return {areaVolume.first, areaVolume.second}; @@ -262,26 
+264,27 @@ Curvature Manifold::Impl::GetCurvature() const { VecDH vertGaussianCurvature(NumVert(), glm::two_pi()); VecDH vertArea(NumVert(), 0); VecDH degree(NumVert(), 0); - thrust::for_each( - thrust::device, countAt(0), countAt(NumTri()), + auto policy = autoPolicy(NumTri()); + for_each( + policy, countAt(0), countAt(NumTri()), CurvatureAngles({vertMeanCurvature.ptrD(), vertGaussianCurvature.ptrD(), vertArea.ptrD(), degree.ptrD(), halfedge_.cptrD(), vertPos_.cptrD(), faceNormal_.cptrD()})); - thrust::for_each_n( - thrust::device, zip(vertMeanCurvature.begin(), vertGaussianCurvature.begin(), + for_each_n( + policy, zip(vertMeanCurvature.begin(), vertGaussianCurvature.begin(), vertArea.begin(), degree.begin()), NumVert(), NormalizeCurvature()); result.minMeanCurvature = - thrust::reduce(thrust::device, vertMeanCurvature.begin(), vertMeanCurvature.end(), + reduce(policy, vertMeanCurvature.begin(), vertMeanCurvature.end(), std::numeric_limits::infinity(), thrust::minimum()); result.maxMeanCurvature = - thrust::reduce(thrust::device, vertMeanCurvature.begin(), vertMeanCurvature.end(), + reduce(policy, vertMeanCurvature.begin(), vertMeanCurvature.end(), -std::numeric_limits::infinity(), thrust::maximum()); - result.minGaussianCurvature = thrust::reduce( - thrust::device, vertGaussianCurvature.begin(), vertGaussianCurvature.end(), std::numeric_limits::infinity(), + result.minGaussianCurvature = reduce( + policy, vertGaussianCurvature.begin(), vertGaussianCurvature.end(), std::numeric_limits::infinity(), thrust::minimum()); - result.maxGaussianCurvature = thrust::reduce( - thrust::device, vertGaussianCurvature.begin(), vertGaussianCurvature.end(), + result.maxGaussianCurvature = reduce( + policy, vertGaussianCurvature.begin(), vertGaussianCurvature.end(), -std::numeric_limits::infinity(), thrust::maximum()); result.vertMeanCurvature.insert(result.vertMeanCurvature.end(), vertMeanCurvature.begin(), @@ -298,9 +301,10 @@ Curvature Manifold::Impl::GetCurvature() const { * 
range for Morton code calculation. */ void Manifold::Impl::CalculateBBox() { - bBox_.min = thrust::reduce(thrust::device, vertPos_.begin(), vertPos_.end(), + auto policy = autoPolicy(NumVert()); + bBox_.min = reduce(policy, vertPos_.begin(), vertPos_.end(), glm::vec3(std::numeric_limits::infinity()), PosMin()); - bBox_.max = thrust::reduce(thrust::device, vertPos_.begin(), vertPos_.end(), + bBox_.max = reduce(policy, vertPos_.begin(), vertPos_.end(), glm::vec3(-std::numeric_limits::infinity()), PosMax()); } } // namespace manifold diff --git a/manifold/src/shared.h b/manifold/src/shared.h index f78b7ada8..9b21467af 100644 --- a/manifold/src/shared.h +++ b/manifold/src/shared.h @@ -18,6 +18,7 @@ #include #include "utils.h" #include "vec_dh.h" +#include "par.h" namespace manifold { @@ -149,9 +150,9 @@ struct TmpInvalid { VecDH inline CreateTmpEdges(const VecDH& halfedge) { VecDH edges(halfedge.size()); - thrust::for_each_n(thrust::device, zip(edges.begin(), halfedge.begin(), countAt(0)), + for_each_n(autoPolicy(edges.size()), zip(edges.begin(), halfedge.begin(), countAt(0)), edges.size(), Halfedge2Tmp()); - int numEdge = thrust::remove_if(thrust::device, edges.begin(), edges.end(), TmpInvalid()) - + int numEdge = remove_if(autoPolicy(edges.size()), edges.begin(), edges.end(), TmpInvalid()) - edges.begin(); ALWAYS_ASSERT(numEdge == halfedge.size() / 2, topologyErr, "Not oriented!"); edges.resize(numEdge); diff --git a/manifold/src/smoothing.cpp b/manifold/src/smoothing.cpp index d33d63be0..48bebfc67 100644 --- a/manifold/src/smoothing.cpp +++ b/manifold/src/smoothing.cpp @@ -15,6 +15,7 @@ #include #include "impl.h" +#include "par.h" namespace { using namespace manifold; @@ -356,7 +357,7 @@ void Manifold::Impl::CreateTangents( const int numHalfedge = halfedge_.size(); halfedgeTangent_.resize(numHalfedge); - thrust::for_each_n(thrust::device, zip(halfedgeTangent_.begin(), halfedge_.cbegin()), + for_each_n(autoPolicy(numHalfedge), zip(halfedgeTangent_.begin(), 
halfedge_.cbegin()), numHalfedge, SmoothBezier({vertPos_.cptrD(), faceNormal_.cptrD(), vertNormal_.cptrD(), halfedge_.cptrD()})); @@ -476,12 +477,13 @@ Manifold::Impl::MeshRelationD Manifold::Impl::Subdivide(int n) { VecDH edges = CreateTmpEdges(halfedge_); VecDH half2Edge(2 * numEdge); - thrust::for_each_n(thrust::device, zip(countAt(0), edges.begin()), numEdge, + auto policy = autoPolicy(numEdge); + for_each_n(policy, zip(countAt(0), edges.begin()), numEdge, ReindexHalfedge({half2Edge.ptrD()})); - thrust::for_each_n(thrust::device, zip(countAt(0), edges.begin()), numEdge, + for_each_n(policy, zip(countAt(0), edges.begin()), numEdge, EdgeVerts({vertPos_.ptrD(), numVert, n})); - thrust::for_each_n( - thrust::device, zip(countAt(0), oldMeshRelation.triBary.begin()), numTri, + for_each_n( + policy, zip(countAt(0), oldMeshRelation.triBary.begin()), numTri, InteriorVerts({vertPos_.ptrD(), relation.barycentric.ptrD(), relation.triBary.ptrD(), meshRelation_.barycentric.ptrD(), meshRelation_.triBary.ptrD(), @@ -489,7 +491,7 @@ Manifold::Impl::MeshRelationD Manifold::Impl::Subdivide(int n) { halfedge_.ptrD()})); // Create subtriangles VecDH triVerts(n * n * numTri); - thrust::for_each_n(thrust::device, countAt(0), numTri, + for_each_n(policy, countAt(0), numTri, SplitTris({triVerts.ptrD(), halfedge_.cptrD(), half2Edge.cptrD(), numVert, triVertStart, n})); CreateHalfedges(triVerts); @@ -503,13 +505,14 @@ void Manifold::Impl::Refine(int n) { if (old.halfedgeTangent_.size() == old.halfedge_.size()) { VecDH vertBary(NumVert()); VecDH lock(NumVert(), 0); - thrust::for_each_n( - thrust::device, zip(relation.triBary.begin(), countAt(0)), NumTri(), + auto policy = autoPolicy(NumTri()); + for_each_n( + policy, zip(relation.triBary.begin(), countAt(0)), NumTri(), TriBary2Vert({vertBary.ptrD(), lock.ptrD(), relation.barycentric.cptrD(), halfedge_.cptrD()})); - thrust::for_each_n( - thrust::device, zip(vertPos_.begin(), vertBary.begin()), NumVert(), + for_each_n( + policy, 
zip(vertPos_.begin(), vertBary.begin()), NumVert(), InterpTri({old.halfedge_.cptrD(), old.halfedgeTangent_.cptrD(), old.vertPos_.cptrD()})); } diff --git a/manifold/src/sort.cpp b/manifold/src/sort.cpp index 69541c60d..aba8f9e8b 100644 --- a/manifold/src/sort.cpp +++ b/manifold/src/sort.cpp @@ -15,6 +15,7 @@ #include #include "impl.h" +#include "par.h" namespace { using namespace manifold; @@ -120,9 +121,9 @@ struct Reindex { template void Permute(VecDH& inOut, const VecDH& new2Old) { - VecDH tmp(inOut); + VecDH tmp(std::move(inOut)); inOut.resize(new2Old.size()); - thrust::gather(new2Old.begin(), new2Old.end(), tmp.begin(), + gather(autoPolicy(new2Old.size()), new2Old.begin(), new2Old.end(), tmp.begin(), inOut.begin()); } @@ -187,7 +188,7 @@ void Manifold::Impl::Finish() { "Not an even number of faces after sorting faces!"); Halfedge extrema = {0, 0, 0, 0}; extrema = - thrust::reduce(thrust::device, halfedge_.begin(), halfedge_.end(), extrema, Extrema()); + reduce(autoPolicy(halfedge_.size()), halfedge_.begin(), halfedge_.end(), extrema, Extrema()); ALWAYS_ASSERT(extrema.startVert >= 0, topologyErr, "Vertex index is negative!"); @@ -210,12 +211,13 @@ void Manifold::Impl::Finish() { */ void Manifold::Impl::SortVerts() { VecDH vertMorton(NumVert()); - thrust::for_each_n(thrust::device, zip(vertMorton.begin(), vertPos_.cbegin()), NumVert(), + auto policy = autoPolicy(NumVert()); + for_each_n(policy, zip(vertMorton.begin(), vertPos_.cbegin()), NumVert(), Morton({bBox_})); VecDH vertNew2Old(NumVert()); - thrust::sequence(thrust::device, vertNew2Old.begin(), vertNew2Old.end()); - thrust::sort_by_key(thrust::device, vertMorton.begin(), vertMorton.end(), + sequence(policy, vertNew2Old.begin(), vertNew2Old.end()); + sort_by_key(policy, vertMorton.begin(), vertMorton.end(), zip(vertPos_.begin(), vertNew2Old.begin())); ReindexVerts(vertNew2Old, NumVert()); @@ -223,7 +225,7 @@ void Manifold::Impl::SortVerts() { // Verts were flagged for removal with NaNs and assigned kNoCode 
to sort them // to the end, which allows them to be removed. const int newNumVert = - thrust::find(thrust::device, vertMorton.begin(), vertMorton.end(), kNoCode) - + find(policy, vertMorton.begin(), vertMorton.end(), kNoCode) - vertMorton.begin(); vertPos_.resize(newNumVert); } @@ -236,9 +238,9 @@ void Manifold::Impl::SortVerts() { void Manifold::Impl::ReindexVerts(const VecDH& vertNew2Old, int oldNumVert) { VecDH vertOld2New(oldNumVert); - thrust::scatter(thrust::device, countAt(0), countAt(NumVert()), vertNew2Old.begin(), + scatter(autoPolicy(oldNumVert), countAt(0), countAt(NumVert()), vertNew2Old.begin(), vertOld2New.begin()); - thrust::for_each(thrust::device, halfedge_.begin(), halfedge_.end(), + for_each(autoPolicy(oldNumVert), halfedge_.begin(), halfedge_.end(), Reindex({vertOld2New.cptrD()})); } @@ -251,8 +253,8 @@ void Manifold::Impl::GetFaceBoxMorton(VecDH& faceBox, VecDH& faceMorton) const { faceBox.resize(NumTri()); faceMorton.resize(NumTri()); - thrust::for_each_n( - thrust::device, zip(faceMorton.begin(), faceBox.begin(), countAt(0)), NumTri(), + for_each_n( + autoPolicy(NumTri()), zip(faceMorton.begin(), faceBox.begin(), countAt(0)), NumTri(), FaceMortonBox({halfedge_.cptrD(), vertPos_.cptrD(), bBox_})); } @@ -263,15 +265,16 @@ void Manifold::Impl::GetFaceBoxMorton(VecDH& faceBox, void Manifold::Impl::SortFaces(VecDH& faceBox, VecDH& faceMorton) { VecDH faceNew2Old(NumTri()); - thrust::sequence(thrust::device, faceNew2Old.begin(), faceNew2Old.end()); + auto policy = autoPolicy(faceNew2Old.size()); + sequence(policy, faceNew2Old.begin(), faceNew2Old.end()); - thrust::sort_by_key(thrust::device, faceMorton.begin(), faceMorton.end(), + sort_by_key(policy, faceMorton.begin(), faceMorton.end(), zip(faceBox.begin(), faceNew2Old.begin())); // Tris were flagged for removal with pairedHalfedge = -1 and assigned kNoCode // to sort them to the end, which allows them to be removed. 
const int newNumTri = - thrust::find(thrust::device, faceMorton.begin(), faceMorton.end(), kNoCode) - + find(policy, faceMorton.begin(), faceMorton.end(), kNoCode) - faceMorton.begin(); faceBox.resize(newNumTri); faceMorton.resize(newNumTri); @@ -295,13 +298,14 @@ void Manifold::Impl::GatherFaces(const VecDH& faceNew2Old) { VecDH oldHalfedge(std::move(halfedge_)); VecDH oldHalfedgeTangent(std::move(halfedgeTangent_)); VecDH faceOld2New(oldHalfedge.size() / 3); - thrust::scatter(thrust::device, countAt(0), countAt(numTri), faceNew2Old.begin(), + auto policy = autoPolicy(numTri); + scatter(policy, countAt(0), countAt(numTri), faceNew2Old.begin(), faceOld2New.begin()); halfedge_.resize(3 * numTri); if (oldHalfedgeTangent.size() != 0) halfedgeTangent_.resize(3 * numTri); - thrust::for_each_n( - thrust::device, countAt(0), numTri, + for_each_n( + policy, countAt(0), numTri, ReindexFace({halfedge_.ptrD(), halfedgeTangent_.ptrD(), oldHalfedge.cptrD(), oldHalfedgeTangent.cptrD(), faceNew2Old.cptrD(), faceOld2New.cptrD()})); @@ -311,25 +315,26 @@ void Manifold::Impl::GatherFaces(const Impl& old, const VecDH& faceNew2Old) { const int numTri = faceNew2Old.size(); meshRelation_.triBary.resize(numTri); - thrust::gather(thrust::device, faceNew2Old.begin(), faceNew2Old.end(), + auto policy = autoPolicy(numTri); + gather(policy, faceNew2Old.begin(), faceNew2Old.end(), old.meshRelation_.triBary.begin(), meshRelation_.triBary.begin()); meshRelation_.barycentric = old.meshRelation_.barycentric; if (old.faceNormal_.size() == old.NumTri()) { faceNormal_.resize(numTri); - thrust::gather(thrust::device, faceNew2Old.begin(), faceNew2Old.end(), + gather(policy, faceNew2Old.begin(), faceNew2Old.end(), old.faceNormal_.begin(), faceNormal_.begin()); } VecDH faceOld2New(old.NumTri()); - thrust::scatter(thrust::device, countAt(0), countAt(numTri), faceNew2Old.begin(), + scatter(policy, countAt(0), countAt(numTri), faceNew2Old.begin(), faceOld2New.begin()); halfedge_.resize(3 * numTri); if 
(old.halfedgeTangent_.size() != 0) halfedgeTangent_.resize(3 * numTri); - thrust::for_each_n( - thrust::device, countAt(0), numTri, + for_each_n( + policy, countAt(0), numTri, ReindexFace({halfedge_.ptrD(), halfedgeTangent_.ptrD(), old.halfedge_.cptrD(), old.halfedgeTangent_.cptrD(), faceNew2Old.cptrD(), faceOld2New.cptrD()})); diff --git a/test/mesh_test.cpp b/test/mesh_test.cpp index 82134888c..e4efdb9c5 100644 --- a/test/mesh_test.cpp +++ b/test/mesh_test.cpp @@ -759,7 +759,7 @@ TEST(Boolean, Gyroid) { EXPECT_TRUE(result.IsManifold()); EXPECT_TRUE(result.MatchesTriNormals()); - EXPECT_LE(result.NumDegenerateTris(), 42); + EXPECT_LE(result.NumDegenerateTris(), 43); EXPECT_EQ(result.Decompose().size(), 1); auto prop = result.GetProperties(); EXPECT_NEAR(prop.volume, 7692, 1); diff --git a/utilities/CMakeLists.txt b/utilities/CMakeLists.txt index 458e65dc8..cd27325b6 100644 --- a/utilities/CMakeLists.txt +++ b/utilities/CMakeLists.txt @@ -1,9 +1,20 @@ project (utilities) -add_library(${PROJECT_NAME} INTERFACE) +add_library(${PROJECT_NAME} src/detect_cuda.cpp) -target_include_directories(${PROJECT_NAME} INTERFACE ${THRUST_INC_DIR} ${GLM_INC_DIR}) -target_include_directories(${PROJECT_NAME} - INTERFACE - ${PROJECT_SOURCE_DIR}/include -) +target_include_directories(${PROJECT_NAME} PUBLIC ${THRUST_INC_DIR} ${GLM_INC_DIR}) +if(MANIFOLD_USE_CUDA) + set_source_files_properties(src/detect_cuda.cpp PROPERTIES LANGUAGE CUDA) + set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_ARCHITECTURES 61) + target_compile_options(${PROJECT_NAME} + PRIVATE ${MANIFOLD_NVCC_FLAGS} + ) + target_compile_options(${PROJECT_NAME} + PRIVATE "$<$:${MANIFOLD_NVCC_RELEASE_FLAGS}>" "$<$:${MANIFOLD_NVCC_DEBUG_FLAGS}>" + ) +else() + target_compile_options(${PROJECT_NAME} + PRIVATE ${MANIFOLD_FLAGS} + ) +endif() +target_include_directories(${PROJECT_NAME} INTERFACE ${PROJECT_SOURCE_DIR}/include) diff --git a/utilities/include/par.h b/utilities/include/par.h new file mode 100644 index 
000000000..a101f54a0 --- /dev/null +++ b/utilities/include/par.h @@ -0,0 +1,154 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#if MANIFOLD_PAR == 'O' +#include +#define MANIFOLD_PAR_NS omp +#elif MANIFOLD_PAR == 'T' +#include +#define MANIFOLD_PAR_NS tbb +#else +#define MANIFOLD_PAR_NS cpp +#endif + +#ifdef MANIFOLD_USE_CUDA +#include +#endif + +namespace manifold { + +void check_cuda_available(); +#ifdef MANIFOLD_USE_CUDA +extern int CUDA_ENABLED; +#else +constexpr int CUDA_ENABLED = 0; +#endif + +enum class ExecutionPolicy { + ParUnseq, + Par, + Seq, +}; + +constexpr ExecutionPolicy ParUnseq = ExecutionPolicy::ParUnseq; +constexpr ExecutionPolicy Par = ExecutionPolicy::Par; +constexpr ExecutionPolicy Seq = ExecutionPolicy::Seq; + +// ExecutionPolicy: +// - Sequential for small workload, +// - Parallel (CPU) for medium workload, +// - GPU for large workload if available. +inline ExecutionPolicy autoPolicy(int size) { + // some random numbers + if (size <= (1 << 12)) { + return Seq; + } + if (size <= (1 << 16) || CUDA_ENABLED != 1) { + return Par; + } + return ParUnseq; +} + +#ifdef MANIFOLD_USE_CUDA +#define THRUST_DYNAMIC_BACKEND_VOID(NAME) \ + template \ + void NAME(ExecutionPolicy policy, Args... args) { \ + switch (policy) { \ + case ExecutionPolicy::ParUnseq: \ + thrust::NAME(thrust::cuda::par, args...); \ + break; \ + case ExecutionPolicy::Par: \ + thrust::NAME(thrust::MANIFOLD_PAR_NS::par, args...); \ + break; \ + case ExecutionPolicy::Seq: \ + thrust::NAME(thrust::cpp::par, args...); \ + break; \ + } \ + } +#define THRUST_DYNAMIC_BACKEND(NAME, RET) \ + template \ + Ret NAME(ExecutionPolicy policy, Args... 
args) { \ + switch (policy) { \ + case ExecutionPolicy::ParUnseq: \ + return thrust::NAME(thrust::cuda::par, args...); \ + case ExecutionPolicy::Par: \ + return thrust::NAME(thrust::MANIFOLD_PAR_NS::par, args...); \ + case ExecutionPolicy::Seq: \ + break; \ + } \ + return thrust::NAME(thrust::cpp::par, args...); \ + } +#else +#define THRUST_DYNAMIC_BACKEND_VOID(NAME) \ + template \ + void NAME(ExecutionPolicy policy, Args... args) { \ + switch (policy) { \ + case ExecutionPolicy::ParUnseq: \ + case ExecutionPolicy::Par: \ + thrust::NAME(thrust::MANIFOLD_PAR_NS::par, args...); \ + break; \ + case ExecutionPolicy::Seq: \ + thrust::NAME(thrust::cpp::par, args...); \ + break; \ + } \ + } + +#define THRUST_DYNAMIC_BACKEND(NAME, RET) \ + template \ + Ret NAME(ExecutionPolicy policy, Args... args) { \ + switch (policy) { \ + case ExecutionPolicy::ParUnseq: \ + case ExecutionPolicy::Par: \ + return thrust::NAME(thrust::MANIFOLD_PAR_NS::par, args...); \ + case ExecutionPolicy::Seq: \ + break; \ + } \ + return thrust::NAME(thrust::cpp::par, args...); \ + } +#endif + +THRUST_DYNAMIC_BACKEND_VOID(gather) +THRUST_DYNAMIC_BACKEND_VOID(scatter) +THRUST_DYNAMIC_BACKEND_VOID(for_each) +THRUST_DYNAMIC_BACKEND_VOID(for_each_n) +THRUST_DYNAMIC_BACKEND_VOID(sort) +THRUST_DYNAMIC_BACKEND_VOID(stable_sort) +THRUST_DYNAMIC_BACKEND_VOID(fill) +THRUST_DYNAMIC_BACKEND_VOID(sequence) +THRUST_DYNAMIC_BACKEND_VOID(sort_by_key) +THRUST_DYNAMIC_BACKEND_VOID(copy) +THRUST_DYNAMIC_BACKEND_VOID(transform) +THRUST_DYNAMIC_BACKEND_VOID(inclusive_scan) +THRUST_DYNAMIC_BACKEND_VOID(exclusive_scan) +THRUST_DYNAMIC_BACKEND_VOID(uninitialized_fill) +THRUST_DYNAMIC_BACKEND_VOID(uninitialized_copy) + +THRUST_DYNAMIC_BACKEND(all_of, bool) +THRUST_DYNAMIC_BACKEND(is_sorted, bool) +THRUST_DYNAMIC_BACKEND(reduce, void) +THRUST_DYNAMIC_BACKEND(count_if, int) +THRUST_DYNAMIC_BACKEND(binary_search, bool) +// void implies that the user have to specify the return type in the template +// argument, as we are unable to 
deduce it +THRUST_DYNAMIC_BACKEND(remove, void) +THRUST_DYNAMIC_BACKEND(copy_if, void) +THRUST_DYNAMIC_BACKEND(remove_if, void) +THRUST_DYNAMIC_BACKEND(unique, void) +THRUST_DYNAMIC_BACKEND(find, void) +THRUST_DYNAMIC_BACKEND(reduce_by_key, void) +THRUST_DYNAMIC_BACKEND(transform_reduce, void) +THRUST_DYNAMIC_BACKEND(lower_bound, void) +THRUST_DYNAMIC_BACKEND(gather_if, void) + +} // namespace manifold diff --git a/utilities/include/sparse.h b/utilities/include/sparse.h index 9a7132e8b..ce1b21509 100644 --- a/utilities/include/sparse.h +++ b/utilities/include/sparse.h @@ -23,6 +23,7 @@ #include "structs.h" #include "utils.h" #include "vec_dh.h" +#include "par.h" namespace manifold { @@ -65,7 +66,7 @@ class SparseIndices { int size() const { return p.size(); } void SwapPQ() { p.swap(q); } - void Sort() { thrust::sort(beginPQ(), endPQ()); } + void Sort() { sort(autoPolicy(size()), beginPQ(), endPQ()); } void Resize(int size) { p.resize(size, -1); @@ -74,7 +75,7 @@ class SparseIndices { void Unique() { Sort(); - int newSize = thrust::unique(beginPQ(), endPQ()) - beginPQ(); + int newSize = unique(autoPolicy(size()), beginPQ(), endPQ()) - beginPQ(); Resize(newSize); } @@ -89,7 +90,7 @@ class SparseIndices { "Different number of values than indicies!"); auto zBegin = zip(S.begin(), begin(false), begin(true)); auto zEnd = zip(S.end(), end(false), end(true)); - size_t size = thrust::remove_if(zBegin, zEnd, firstZero()) - zBegin; + size_t size = remove_if(autoPolicy(S.size()), zBegin, zEnd, firstZero()) - zBegin; S.resize(size, -1); p.resize(size, -1); q.resize(size, -1); @@ -122,9 +123,7 @@ class SparseIndices { "Different number of values than indicies!"); auto zBegin = zip(v.begin(), x.begin(), begin(false), begin(true)); auto zEnd = zip(v.end(), x.end(), end(false), end(true)); - size_t size = - thrust::remove_if(thrust::device, zBegin, zEnd, firstNonFinite()) - - zBegin; + size_t size = remove_if(autoPolicy(v.size()), zBegin, zEnd, firstNonFinite()) - zBegin; 
v.resize(size); x.resize(size, -1); p.resize(size, -1); @@ -141,13 +140,12 @@ class SparseIndices { VecDH result(size); VecDH found(size); VecDH temp(size); - thrust::fill(thrust::device, result.begin(), result.end(), missingVal); - thrust::binary_search(thrust::device, beginPQ(), endPQ(), pqBegin, pqEnd, - found.begin()); - thrust::lower_bound(thrust::device, beginPQ(), endPQ(), pqBegin, pqEnd, - temp.begin()); - thrust::gather_if(thrust::device, temp.begin(), temp.end(), found.begin(), - val.begin(), result.begin()); + auto policy = autoPolicy(size); + fill(policy, result.begin(), result.end(), missingVal); + binary_search(policy, beginPQ(), endPQ(), pqBegin, pqEnd, found.begin()); + lower_bound(policy, beginPQ(), endPQ(), pqBegin, pqEnd, temp.begin()); + gather_if(policy, temp.begin(), temp.end(), found.begin(), val.begin(), + result.begin()); return result; } diff --git a/utilities/include/utils.h b/utilities/include/utils.h index c00a15115..d2a295d63 100644 --- a/utilities/include/utils.h +++ b/utilities/include/utils.h @@ -23,6 +23,8 @@ #include +#include "par.h" + namespace manifold { /** @defgroup Private @@ -40,8 +42,12 @@ inline void MemUsage() { inline void CheckDevice() { #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) throw std::runtime_error(cudaGetErrorString(error)); + if (CUDA_ENABLED == -1) + check_cuda_available(); + if (CUDA_ENABLED) { + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) throw std::runtime_error(cudaGetErrorString(error)); + } #endif } diff --git a/utilities/include/vec_dh.h b/utilities/include/vec_dh.h index f119fd57f..a13431a79 100644 --- a/utilities/include/vec_dh.h +++ b/utilities/include/vec_dh.h @@ -13,11 +13,286 @@ // limitations under the License. 
#pragma once -#include -#include +#include +#ifdef MANIFOLD_USE_CUDA +#include +#endif +#include + +#include "par.h" +#include "structs.h" namespace manifold { +// Vector implementation optimized for managed memory, will perform memory +// prefetching to minimize page faults and use parallel/GPU copy/fill depending +// on data size. This will also handle builds without CUDA or builds with CUDA +// but runned without CUDA GPU properly. +// +// Note that the constructor and resize function will not perform initialization +// if the parameter val is not set. Also, this implementation is a toy +// implementation that did not consider things like non-trivial +// constructor/destructor, please keep T trivial. +template class ManagedVec { +public: + typedef T *Iter; + typedef const T *IterC; + + ManagedVec() { + size_ = 0; + capacity_ = 0; + onHost = true; + } + + // note that we leave the memory uninitialized + ManagedVec(size_t n) { + size_ = n; + capacity_ = n; + onHost = autoPolicy(n) != ExecutionPolicy::ParUnseq; + if (n == 0) + return; + mallocManaged(&ptr_, size_ * sizeof(T)); + } + + ManagedVec(size_t n, const T &val) { + size_ = n; + capacity_ = n; + if (n == 0) + return; + auto policy = autoPolicy(n); + onHost = policy != ExecutionPolicy::ParUnseq; + mallocManaged(&ptr_, size_ * sizeof(T)); + prefetch(ptr_, size_ * sizeof(T), onHost); + uninitialized_fill_n(policy, ptr_, n, val); + } + + ~ManagedVec() { + if (ptr_ != nullptr) + freeManaged(ptr_); + ptr_ = nullptr; + size_ = 0; + capacity_ = 0; + } + + ManagedVec(const std::vector &vec) { + size_ = vec.size(); + capacity_ = size_; + auto policy = autoPolicy(size_); + onHost = policy != ExecutionPolicy::ParUnseq; + if (size_ != 0) { + mallocManaged(&ptr_, size_ * sizeof(T)); + fastUninitializedCopy(ptr_, vec.data(), size_, policy); + } + } + + ManagedVec(const ManagedVec &vec) { + size_ = vec.size_; + capacity_ = size_; + auto policy = autoPolicy(size_); + onHost = policy != ExecutionPolicy::ParUnseq; + if (size_ 
!= 0) { + mallocManaged(&ptr_, size_ * sizeof(T)); + prefetch(ptr_, size_ * sizeof(T), onHost); + uninitialized_copy(policy, vec.begin(), vec.end(), ptr_); + } + } + + ManagedVec(ManagedVec &&vec) { + ptr_ = vec.ptr_; + size_ = vec.size_; + capacity_ = vec.capacity_; + onHost = vec.onHost; + vec.ptr_ = nullptr; + vec.size_ = 0; + vec.capacity_ = 0; + } + + ManagedVec &operator=(const ManagedVec &vec) { + if (&vec == this) + return *this; + if (ptr_ != nullptr) + freeManaged(ptr_); + size_ = vec.size_; + capacity_ = vec.size_; + auto policy = autoPolicy(size_); + onHost = policy != ExecutionPolicy::ParUnseq; + if (size_ != 0) { + mallocManaged(&ptr_, size_ * sizeof(T)); + prefetch(ptr_, size_ * sizeof(T), onHost); + uninitialized_copy(policy, vec.begin(), vec.end(), ptr_); + } + return *this; + } + + ManagedVec &operator=(ManagedVec &&vec) { + if (&vec == this) + return *this; + if (ptr_ != nullptr) + freeManaged(ptr_); + onHost = vec.onHost; + size_ = vec.size_; + capacity_ = vec.capacity_; + ptr_ = vec.ptr_; + vec.ptr_ = nullptr; + vec.size_ = 0; + vec.capacity_ = 0; + return *this; + } + + void resize(size_t n) { + reserve(n); + size_ = n; + } + + void resize(size_t n, const T &val) { + reserve(n); + if (size_ < n) { + uninitialized_fill(autoPolicy(n - size_), ptr_ + size_, ptr_ + n, val); + } + size_ = n; + } + + void reserve(size_t n) { + if (n > capacity_) { + T *newBuffer; + mallocManaged(&newBuffer, n * sizeof(T)); + prefetch(newBuffer, size_ * sizeof(T), onHost); + if (size_ > 0) { + uninitialized_copy(autoPolicy(size_), ptr_, ptr_ + size_, newBuffer); + } + if (ptr_ != nullptr) + freeManaged(ptr_); + ptr_ = newBuffer; + capacity_ = n; + } + } + + void shrink_to_fit() { + T *newBuffer = nullptr; + if (size_ > 0) { + mallocManaged(&newBuffer, size_ * sizeof(T)); + prefetch(newBuffer, size_ * sizeof(T), onHost); + uninitialized_copy(autoPolicy(size_), ptr_, ptr_ + size_, newBuffer); + } + freeManaged(ptr_); + ptr_ = newBuffer; + capacity_ = size_; + } + + 
void push_back(const T &val) { + if (size_ >= capacity_) { + // avoid dangling pointer in case val is a reference of our array + T val_copy = val; + reserve(capacity_ == 0 ? 128 : capacity_ * 2); + onHost = true; + ptr_[size_++] = val_copy; + return; + } + ptr_[size_++] = val; + } + + void swap(ManagedVec &other) { + std::swap(ptr_, other.ptr_); + std::swap(size_, other.size_); + std::swap(capacity_, other.capacity_); + std::swap(onHost, other.onHost); + } + + void prefetch_to(bool toHost) const { + if (toHost != onHost) + prefetch(ptr_, size_ * sizeof(T), toHost); + onHost = toHost; + } + + IterC cbegin() const { return ptr_; } + + IterC cend() const { return ptr_ + size_; } + + Iter begin() { return ptr_; } + + Iter end() { return ptr_ + size_; } + + IterC begin() const { return cbegin(); } + + IterC end() const { return cend(); } + + T *data() { return ptr_; } + + const T *data() const { return ptr_; } + + size_t size() const { return size_; } + + size_t capacity() const { return capacity_; } + + T &front() { return *ptr_; } + + const T &front() const { return *ptr_; } + + T &back() { return *(ptr_ + size_ - 1); } + + const T &back() const { return *(ptr_ + size_ - 1); } + + T &operator[](size_t i) { return *(ptr_ + i); } + + const T &operator[](size_t i) const { return *(ptr_ + i); } + + bool empty() const { return size_ == 0; } + +private: + T *ptr_ = nullptr; + size_t size_ = 0; + size_t capacity_ = 0; + mutable bool onHost = true; + + static constexpr int DEVICE_MAX_BYTES = 1 << 16; + + static void mallocManaged(T **ptr, size_t bytes) { +#ifdef MANIFOLD_USE_CUDA + if (CUDA_ENABLED == -1) + check_cuda_available(); + if (CUDA_ENABLED) + cudaMallocManaged(reinterpret_cast<void **>(ptr), bytes); + else +#endif + *ptr = reinterpret_cast<T *>(malloc(bytes)); + } + + static void freeManaged(T *ptr) { +#ifdef MANIFOLD_USE_CUDA + if (CUDA_ENABLED) + cudaFree(ptr); + else +#endif + free(ptr); + } + + static void prefetch(T *ptr, int bytes, bool onHost) { +#ifdef MANIFOLD_USE_CUDA + 
if (bytes > 0 && CUDA_ENABLED) + cudaMemPrefetchAsync(ptr, std::min(bytes, DEVICE_MAX_BYTES), + onHost ? cudaCpuDeviceId : 0); +#endif + } + + // fast routine for memcpy from std::vector to ManagedVec + static void fastUninitializedCopy(T *dst, const T *src, int n, + ExecutionPolicy policy) { + prefetch(dst, n * sizeof(T), policy != ExecutionPolicy::ParUnseq); + switch (policy) { + case ExecutionPolicy::ParUnseq: +#ifdef MANIFOLD_USE_CUDA + cudaMemcpy(dst, src, n * sizeof(T), cudaMemcpyHostToDevice); + break; +#endif + case ExecutionPolicy::Par: + thrust::uninitialized_copy_n(thrust::MANIFOLD_PAR_NS::par, src, n, dst); + break; + case ExecutionPolicy::Seq: + thrust::uninitialized_copy_n(thrust::cpp::par, src, n, dst); + break; + } + } +}; + /* * Host and device vector implementation. This uses `thrust::universal_vector` * for storage, so data can be moved by the hardware on demand, allows using @@ -35,268 +310,136 @@ namespace manifold { * some device(host) modification, and then read the host(device) pointer again * (on the same vector). The memory will be inconsistent in that case. 
*/ -template <typename T> -class VecDH { - public: - VecDH() { impl_ = thrust::universal_vector<T>(); } - - VecDH(int size, T val = T()) { - impl_.resize(size, val); - implModified = true; - cacheModified = false; - } - - VecDH(const std::vector<T>& vec) { - cache = vec; - cacheModified = true; - implModified = false; - } - - VecDH(const VecDH& other) { - if (!other.cacheModified) { - if (other.impl_.size() > 0) - impl_ = other.impl_; - else - impl_.clear(); - implModified = true; - cacheModified = false; - } else { - cache = other.cache; - implModified = false; - cacheModified = true; - } - } +template <typename T> class VecDH { +public: + VecDH() {} - VecDH(VecDH&& other) { - if (!other.cacheModified) { - if (other.impl_.size() > 0) - impl_ = std::move(other.impl_); - else - impl_.clear(); - implModified = true; - cacheModified = false; - } else { - cache = std::move(other.cache); - cacheModified = true; - implModified = false; - } - } + + // Note that the vector constructed with this constructor will contain + // uninitialized memory. Please specify `val` if you need to make sure that + // the data is initialized. 
+ VecDH(int size) { impl_.resize(size); } - VecDH& operator=(const VecDH& other) { - if (!other.cacheModified) { - if (other.impl_.size() > 0) - impl_ = other.impl_; - else - impl_.clear(); - implModified = true; - cacheModified = false; - } else { - cache = other.cache; - cacheModified = true; - implModified = false; - } + VecDH(int size, T val) { impl_.resize(size, val); } + + VecDH(const std::vector<T> &vec) { impl_ = vec; } + + VecDH(const VecDH &other) { impl_ = other.impl_; } + + VecDH(VecDH &&other) { impl_ = std::move(other.impl_); } + + VecDH &operator=(const VecDH &other) { + impl_ = other.impl_; return *this; } - VecDH& operator=(VecDH&& other) { - if (!other.cacheModified) { - if (other.impl_.size() > 0) - impl_ = std::move(other.impl_); - else - impl_.clear(); - implModified = true; - cacheModified = false; - } else { - cache = std::move(other.cache); - cacheModified = true; - implModified = false; - } + VecDH &operator=(VecDH &&other) { + impl_ = std::move(other.impl_); return *this; } - int size() const { - if (!cacheModified) return impl_.size(); - return cache.size(); - } + int size() const { return impl_.size(); } void resize(int newSize, T val = T()) { bool shrink = size() > 2 * newSize; - if (cacheModified) { - cache.resize(newSize, val); - if (shrink) cache.shrink_to_fit(); - } else { - impl_.resize(newSize, val); - if (shrink) impl_.shrink_to_fit(); - implModified = true; - } + impl_.resize(newSize, val); + if (shrink) + impl_.shrink_to_fit(); } - void swap(VecDH& other) { - if (!cacheModified && !other.cacheModified) { - implModified = true; - other.implModified = true; - impl_.swap(other.impl_); - } else { - syncCache(); - other.syncCache(); - cacheModified = true; - implModified = false; - other.cacheModified = true; - other.implModified = false; - cache.swap(other.cache); - } - } + void swap(VecDH &other) { impl_.swap(other.impl_); } - using Iter = typename thrust::universal_vector<T>::iterator; - using IterC = typename 
thrust::universal_vector<T>::const_iterator; + using Iter = typename ManagedVec<T>::Iter; + using IterC = typename ManagedVec<T>::IterC; Iter begin() { - syncImpl(); - implModified = true; + impl_.prefetch_to(autoPolicy(size()) != ExecutionPolicy::ParUnseq); return impl_.begin(); } - Iter end() { - syncImpl(); - implModified = true; - return impl_.end(); - } + Iter end() { return impl_.end(); } IterC cbegin() const { - syncImpl(); + impl_.prefetch_to(autoPolicy(size()) != ExecutionPolicy::ParUnseq); return impl_.cbegin(); } - IterC cend() const { - syncImpl(); - return impl_.cend(); - } + IterC cend() const { return impl_.cend(); } IterC begin() const { return cbegin(); } IterC end() const { return cend(); } - T* ptrD() { - if (size() == 0) return nullptr; - syncImpl(); - implModified = true; - return impl_.data().get(); + T *ptrD() { + if (size() == 0) + return nullptr; + impl_.prefetch_to(autoPolicy(size()) != ExecutionPolicy::ParUnseq); + return impl_.data(); } - const T* cptrD() const { - if (size() == 0) return nullptr; - syncImpl(); - return impl_.data().get(); + const T *cptrD() const { + if (size() == 0) + return nullptr; + impl_.prefetch_to(autoPolicy(size()) != ExecutionPolicy::ParUnseq); + return impl_.data(); } - const T* ptrD() const { return cptrD(); } + const T *ptrD() const { return cptrD(); } - T* ptrH() { - if (size() == 0) return nullptr; - if (cacheModified) { - return cache.data(); - } else { - implModified = true; - return impl_.data().get(); - } + T *ptrH() { + if (size() == 0) + return nullptr; + impl_.prefetch_to(true); + return impl_.data(); } - const T* cptrH() const { - if (size() == 0) return nullptr; - if (cacheModified) { - return cache.data(); - } else { - return impl_.data().get(); - } + const T *cptrH() const { + if (size() == 0) + return nullptr; + impl_.prefetch_to(true); + return impl_.data(); } - const T* ptrH() const { return cptrH(); } + const T *ptrH() const { return cptrH(); } - T& operator[](int i) { - if (!cacheModified) { - 
implModified = true; - return impl_[i]; - } else { - cacheModified = true; - return cache[i]; - } + T &operator[](int i) { + impl_.prefetch_to(true); + return impl_[i]; } - const T& operator[](int i) const { - if (!cacheModified) { - return impl_[i]; - } else { - return cache[i]; - } + const T &operator[](int i) const { + impl_.prefetch_to(true); + return impl_[i]; } - T& back() { - if (!cacheModified) { - implModified = true; - return impl_.back(); - } else { - return cache.back(); - } - } + T &back() { return impl_.back(); } - const T& back() const { - if (!cacheModified) { - return impl_.back(); - } else { - return cache.back(); - } - } + const T &back() const { return impl_.back(); } - void push_back(const T& val) { - syncCache(); - cacheModified = true; - cache.push_back(val); - } + void push_back(const T &val) { impl_.push_back(val); } - void reserve(int n) { - syncCache(); - cacheModified = true; - cache.reserve(n); - } + void reserve(int n) { impl_.reserve(n); } void Dump() const { - syncCache(); - manifold::Dump(cache); - } - - private: - mutable thrust::universal_vector<T> impl_; - - mutable bool implModified = false; - mutable bool cacheModified = false; - mutable std::vector<T> cache; - - void syncImpl() const { - if (cacheModified) { - impl_ = cache; - cache.clear(); + std::cout << "VecDH = " << std::endl; + for (int i = 0; i < impl_.size(); ++i) { + std::cout << i << ", " << impl_[i] << ", " << std::endl; } - cacheModified = false; + std::cout << std::endl; } - void syncCache() const { - if (implModified || cache.empty()) { - cache = std::vector<T>(impl_.begin(), impl_.end()); - } - implModified = false; - } +private: + ManagedVec<T> impl_; }; -template <typename T> -class VecD { - public: - VecD(const VecDH<T>& vec) : ptr_(vec.ptrD()), size_(vec.size()) {} +template <typename T> class VecD { +public: + VecD(const VecDH<T> &vec) : ptr_(vec.ptrD()), size_(vec.size()) {} - __host__ __device__ const T& operator[](int i) const { return ptr_[i]; } + __host__ __device__ const T &operator[](int i) 
const { return ptr_[i]; } __host__ __device__ int size() const { return size_; } - private: - T const* const ptr_; +private: + T const *const ptr_; const int size_; }; /** @} */ -} // namespace manifold +} // namespace manifold diff --git a/utilities/src/detect_cuda.cpp b/utilities/src/detect_cuda.cpp new file mode 100644 index 000000000..e3aa80b8e --- /dev/null +++ b/utilities/src/detect_cuda.cpp @@ -0,0 +1,16 @@ +#ifdef MANIFOLD_USE_CUDA +#include <cuda_runtime.h> + +namespace manifold { +int CUDA_ENABLED = -1; +void check_cuda_available() { + int device_count = 0; + cudaError_t error = cudaGetDeviceCount(&device_count); + CUDA_ENABLED = device_count != 0; +} +} +#else +namespace manifold { +void check_cuda_available() {} +} +#endif