* Upgrade presets for CUDA 12.3.1, cuDNN 8.9.7, NCCL 2.19.3, nvCOMP 3.0.5, DepthAI 2.24.0
saudet committed Dec 29, 2023
1 parent 6e36cac commit a885e6c
Showing 66 changed files with 1,165 additions and 291 deletions.
30 changes: 15 additions & 15 deletions .github/actions/deploy-ubuntu/action.yml
@@ -31,29 +31,29 @@ runs:
export ARCH=arm64
export ARCH_CUDA=sbsa
export PREFIX=aarch64-linux-gnu
-export CUDA=cuda-repo-rhel8-12-3-local-12.3.0_545.23.06-1.aarch64.rpm
-export CUDNN=8.9.5.29-1.cuda12.2.aarch64
-export NCCL=2.18.5-1+cuda12.2.aarch64
-export NVCOMP=nvcomp_3.0.4_SBSA_12.x
+export CUDA=cuda-repo-rhel8-12-3-local-12.3.1_545.23.08-1.aarch64.rpm
+export CUDNN=8.9.7.29-1.cuda12.2.aarch64
+export NCCL=2.19.3-1+cuda12.3.aarch64
+export NVCOMP=nvcomp_3.0.5_SBSA_12.x
export USERLAND_BUILDME="buildme --aarch64"
elif [[ "$CI_DEPLOY_PLATFORM" == "linux-ppc64le" ]]; then
export ARCH=ppc64el
export ARCH_CUDA=ppc64le
export PREFIX=powerpc64le-linux-gnu
-export CUDA=cuda-repo-rhel8-12-3-local-12.3.0_545.23.06-1.ppc64le.rpm
-export CUDNN=8.9.5.29-1.cuda12.2.ppc64le
-export NCCL=2.18.5-1+cuda12.2.ppc64le
+export CUDA=cuda-repo-rhel8-12-3-local-12.3.1_545.23.08-1.ppc64le.rpm
+export CUDNN=8.9.7.29-1.cuda12.2.ppc64le
+export NCCL=2.19.3-1+cuda12.3.ppc64le
elif [[ "$CI_DEPLOY_PLATFORM" == "linux-x86" ]]; then
export ARCH=i386
export PREFIX=i686-linux-gnu
elif [[ "$CI_DEPLOY_PLATFORM" == "linux-x86_64" ]]; then
export ARCH=amd64
export ARCH_CUDA=x86_64
export PREFIX=x86_64-linux-gnu
-export CUDA=cuda-repo-rhel8-12-3-local-12.3.0_545.23.06-1.x86_64.rpm
-export CUDNN=8.9.5.29-1.cuda12.2.x86_64
-export NCCL=2.18.5-1+cuda12.2.x86_64
-export NVCOMP=nvcomp_3.0.4_x86_64_12.x
+export CUDA=cuda-repo-rhel8-12-3-local-12.3.1_545.23.08-1.x86_64.rpm
+export CUDNN=8.9.7.29-1.cuda12.2.x86_64
+export NCCL=2.19.3-1+cuda12.3.x86_64
+export NVCOMP=nvcomp_3.0.5_x86_64_12.x
fi
echo "ARCH=$ARCH" >> $GITHUB_ENV
echo "PREFIX=$PREFIX" >> $GITHUB_ENV
@@ -140,7 +140,7 @@ runs:
if [[ -n ${ARCH_CUDA:-} ]] && [[ -n ${CI_DEPLOY_NEED_CUDA:-} ]]; then
echo Installing CUDA, cuDNN, nvCOMP, etc
-curl -LO https://developer.download.nvidia.com/compute/cuda/12.3.0/local_installers/$CUDA
+curl -LO https://developer.download.nvidia.com/compute/cuda/12.3.1/local_installers/$CUDA
curl -LO https://developer.download.nvidia.com/compute/cuda/repos/rhel8/$ARCH_CUDA/libcudnn8-$CUDNN.rpm
curl -LO https://developer.download.nvidia.com/compute/cuda/repos/rhel8/$ARCH_CUDA/libcudnn8-devel-$CUDNN.rpm
curl -LO https://developer.download.nvidia.com/compute/cuda/repos/rhel8/$ARCH_CUDA/libnccl-$NCCL.rpm
@@ -156,9 +156,9 @@ runs:
$SUDO mv /usr/lib64/libcudnn* /usr/lib64/libnccl* /usr/local/cuda/lib64/
if [[ -n ${NVCOMP:-} ]]; then
-curl -LO https://developer.download.nvidia.com/compute/nvcomp/3.0.4/local_installers/$NVCOMP.tgz
-$SUDO tar -xvf nvcomp_*.tgz -C /usr/local/cuda/lib64/ --strip-components=1 lib/
-$SUDO tar -xvf nvcomp_*.tgz -C /usr/local/cuda/include/ --strip-components=1 include/
+curl -LO https://developer.download.nvidia.com/compute/nvcomp/3.0.5/local_installers/$NVCOMP.tgz
+$SUDO tar -xvf $NVCOMP.tgz -C /usr/local/cuda/lib64/ --strip-components=1 lib/ || $SUDO tar -xvf $NVCOMP.tgz -C /usr/local/cuda/lib64/ --strip-components=2 nvcomp-3.0.5-ctk-12.2/lib/
+$SUDO tar -xvf $NVCOMP.tgz -C /usr/local/cuda/include/ --strip-components=1 include/ || $SUDO tar -xvf $NVCOMP.tgz -C /usr/local/cuda/include/ --strip-components=2 nvcomp-3.0.5-ctk-12.2/include/
rm -f $NVCOMP.tgz
fi
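The nvCOMP extraction above gained a `||` fallback: the first `tar` succeeds on archives that keep `lib/` and `include/` at the top level, and if that member path is absent, the second attempt assumes the contents are nested under `nvcomp-3.0.5-ctk-12.2/`, as in some of the 3.0.5 tarballs. A minimal standalone sketch of the same probe-then-fall-back pattern (file and directory names are illustrative):

```bash
#!/bin/bash
# Extract an nvCOMP tarball whose internal layout may be flat or nested.
NVCOMP=nvcomp_3.0.5_x86_64_12.x   # matches the x86_64 case above
CUDA_HOME=/usr/local/cuda

# GNU tar exits non-zero when the named member is absent, so the || branch
# only runs for archives using the nested nvcomp-3.0.5-ctk-12.2/ layout.
sudo tar -xvf "$NVCOMP.tgz" -C "$CUDA_HOME/lib64/" --strip-components=1 lib/ ||
  sudo tar -xvf "$NVCOMP.tgz" -C "$CUDA_HOME/lib64/" --strip-components=2 nvcomp-3.0.5-ctk-12.2/lib/
sudo tar -xvf "$NVCOMP.tgz" -C "$CUDA_HOME/include/" --strip-components=1 include/ ||
  sudo tar -xvf "$NVCOMP.tgz" -C "$CUDA_HOME/include/" --strip-components=2 nvcomp-3.0.5-ctk-12.2/include/
rm -f "$NVCOMP.tgz"
```

Switching from the `nvcomp_*.tgz` glob to `$NVCOMP.tgz` also makes the commands deterministic when more than one nvCOMP tarball is present in the working directory.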
20 changes: 10 additions & 10 deletions .github/actions/deploy-windows/action.yml
@@ -94,27 +94,27 @@ runs:
if "%CI_DEPLOY_PLATFORM%"=="windows-x86_64" if not "%CI_DEPLOY_NEED_CUDA%"=="" (
echo Installing CUDA, cuDNN, nvCOMP, etc
curl -LO https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_522.06_windows.exe
-curl -LO https://developer.download.nvidia.com/compute/cuda/12.3.0/local_installers/cuda_12.3.0_545.84_windows.exe
+curl -LO https://developer.download.nvidia.com/compute/cuda/12.3.1/local_installers/cuda_12.3.1_546.12_windows.exe
rem curl -LO https://developer.download.nvidia.com/compute/redist/cudnn/v8.8.0/local_installers/12.0/cudnn_8.8.0.121_windows.exe
-python -m gdown.cli https://drive.google.com/uc?id=1-5QHvwDZC_1rhn5W6fRHNWicXRPtqt31
+python -m gdown.cli https://drive.google.com/uc?id=135Z7zfwguQe6vn7p013HtVkHFu9-_rru
curl -LO http://www.winimage.com/zLibDll/zlib123dllx64.zip
-curl -LO https://developer.download.nvidia.com/compute/nvcomp/3.0.4/local_installers/nvcomp_3.0.4_windows_12.x.zip
+curl -LO https://developer.download.nvidia.com/compute/nvcomp/3.0.5/local_installers/nvcomp_3.0.5_windows_12.x.zip
cuda_11.8.0_522.06_windows.exe -s
bash -c "rm -Rf 'C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8'"
bash -c "mv 'C:/Program Files/NVIDIA Corporation/NvToolsExt' 'C:/Program Files/NVIDIA Corporation/NvToolsExt_old'"
-cuda_12.3.0_545.84_windows.exe -s
+cuda_12.3.1_546.12_windows.exe -s
bash -c "mv 'C:/Program Files/NVIDIA Corporation/NvToolsExt_old' 'C:/Program Files/NVIDIA Corporation/NvToolsExt'"
bash -c "ls 'C:/Program Files/NVIDIA Corporation/NvToolsExt'"
rem cudnn_8.8.0.121_windows.exe -s
-unzip cudnn-windows-x86_64-8.9.5.29_cuda12-archive.zip
+unzip cudnn-windows-x86_64-8.9.7.29_cuda12-archive.zip
unzip zlib123dllx64.zip
-unzip nvcomp_3.0.4_windows_12.x.zip
+unzip nvcomp_3.0.5_windows_12.x.zip
rem move "%ProgramFiles%\NVIDIA\CUDNN\v8.8\bin\*.dll" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\bin"
rem move "%ProgramFiles%\NVIDIA\CUDNN\v8.8\include\*.h" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\include"
rem move "%ProgramFiles%\NVIDIA\CUDNN\v8.8\lib\x64\*.lib" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\lib\x64"
-move cudnn-windows-x86_64-8.9.5.29_cuda12-archive\bin\*.dll "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\bin"
-move cudnn-windows-x86_64-8.9.5.29_cuda12-archive\include\*.h "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\include"
-move cudnn-windows-x86_64-8.9.5.29_cuda12-archive\lib\x64\*.lib "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\lib\x64"
+move cudnn-windows-x86_64-8.9.7.29_cuda12-archive\bin\*.dll "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\bin"
+move cudnn-windows-x86_64-8.9.7.29_cuda12-archive\include\*.h "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\include"
+move cudnn-windows-x86_64-8.9.7.29_cuda12-archive\lib\x64\*.lib "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\lib\x64"
move dll_x64\zlibwapi.dll "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\bin"
move include\* "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\include"
move include\gdeflate "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\include"
@@ -229,7 +229,7 @@ runs:
set "CUDA_PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3"
set "CUDA_PATH_V12_3=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3"
set "PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\libnvvp;%PATH%"
-echo CUDA Version 12.3.0>"%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\version.txt"
+echo CUDA Version 12.3.1>"%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\version.txt"
)
set "CCACHE_DIR=%USERPROFILE%\ccache"
set "PATH=C:\hostedtoolcache\windows\Python\3.9.13\x64;C:\msys64\%MSYSTEM%\bin;C:\msys64\usr\bin;%ProgramFiles%\apache-maven-3.6.3\bin;%PATH%"
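Note the dance around the 12.3.1 installer: the `NvToolsExt` tree left behind by CUDA 11.8 is parked under a temporary name before `cuda_12.3.1_546.12_windows.exe -s` runs, then restored, presumably because the 12.x installers no longer ship (and may remove) that standalone NVTX directory while downstream builds still look for it. A sketch of the pattern as it would look in MSYS bash (invocation details are illustrative):

```bash
# Park the NvToolsExt tree installed by CUDA 11.8 so the CUDA 12.3.1
# installer cannot clobber it, then restore it after the silent install.
NVTOOLS="C:/Program Files/NVIDIA Corporation/NvToolsExt"
mv "$NVTOOLS" "${NVTOOLS}_old"
cmd //c cuda_12.3.1_546.12_windows.exe -s   # silent CUDA 12.3.1 install
mv "${NVTOOLS}_old" "$NVTOOLS"
ls "$NVTOOLS"                               # sanity check: the tree survived
```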
4 changes: 2 additions & 2 deletions CHANGELOG.md
@@ -1,11 +1,11 @@

* Include `timeapi.h` for system API of Windows ([pull #1447](https://github.com/bytedeco/javacpp-presets/pull/1447))
* Add Android and Windows builds to presets for DepthAI ([pull #1441](https://github.com/bytedeco/javacpp-presets/pull/1441))
-* Add presets for nvCOMP 3.0.4 ([pull #1434](https://github.com/bytedeco/javacpp-presets/pull/1434)), SentencePiece 0.1.99 ([pull #1384](https://github.com/bytedeco/javacpp-presets/pull/1384))
+* Add presets for nvCOMP 3.0.5 ([pull #1434](https://github.com/bytedeco/javacpp-presets/pull/1434)), SentencePiece 0.1.99 ([pull #1384](https://github.com/bytedeco/javacpp-presets/pull/1384))
* Refactor and improve presets for PyTorch ([pull #1360](https://github.com/bytedeco/javacpp-presets/pull/1360))
* Include `mkl_lapack.h` header file in presets for MKL ([issue #1388](https://github.com/bytedeco/javacpp-presets/issues/1388))
* Map new higher-level C++ API of Triton Inference Server ([pull #1361](https://github.com/bytedeco/javacpp-presets/pull/1361))
-* Upgrade presets for OpenCV 4.8.1, FFmpeg 6.1, HDF5 1.14.3, MKL 2024.0, DNNL 3.3.3, OpenBLAS 0.3.25, ARPACK-NG 3.9.1, CPython 3.12.1, NumPy 1.26.2, SciPy 1.11.4, LLVM 17.0.6, Leptonica 1.83.1, Tesseract 5.3.3, CUDA 12.3.0, cuDNN 8.9.5, NCCL 2.18.5, PyTorch 2.1.2 ([pull #1426](https://github.com/bytedeco/javacpp-presets/pull/1426)), TensorFlow Lite 2.15.0, Triton Inference Server 2.38.0, DepthAI 2.23.0, ONNX 1.15.0, ONNX Runtime 1.16.3, TVM 0.14.0, and their dependencies
+* Upgrade presets for OpenCV 4.8.1, FFmpeg 6.1, HDF5 1.14.3, MKL 2024.0, DNNL 3.3.3, OpenBLAS 0.3.25, ARPACK-NG 3.9.1, CPython 3.12.1, NumPy 1.26.2, SciPy 1.11.4, LLVM 17.0.6, Leptonica 1.83.1, Tesseract 5.3.3, CUDA 12.3.1, cuDNN 8.9.7, NCCL 2.19.3, PyTorch 2.1.2 ([pull #1426](https://github.com/bytedeco/javacpp-presets/pull/1426)), TensorFlow Lite 2.15.0, Triton Inference Server 2.38.0, DepthAI 2.24.0, ONNX 1.15.0, ONNX Runtime 1.16.3, TVM 0.14.0, and their dependencies

### June 6, 2023 version 1.5.9
* Virtualize `nvinfer1::IGpuAllocator` from TensorRT to allow customization ([pull #1367](https://github.com/bytedeco/javacpp-presets/pull/1367))
2 changes: 1 addition & 1 deletion README.md
@@ -230,7 +230,7 @@ Each child module in turn relies by default on the included [`cppbuild.sh` scrip
* TensorRT 8.6.x https://developer.nvidia.com/tensorrt
* Triton Inference Server 2.38.x https://developer.nvidia.com/nvidia-triton-inference-server
* The Arcade Learning Environment 0.8.x https://github.com/mgbellemare/Arcade-Learning-Environment
-* DepthAI 2.23.x https://github.com/luxonis/depthai-core
+* DepthAI 2.24.x https://github.com/luxonis/depthai-core
* ONNX 1.15.x https://github.com/onnx/onnx
* nGraph 0.26.0 https://github.com/NervanaSystems/ngraph
* ONNX Runtime 1.16.x https://github.com/microsoft/onnxruntime
8 changes: 4 additions & 4 deletions cuda/README.md
@@ -25,10 +25,10 @@ Introduction
------------
This directory contains the JavaCPP Presets module for:

-* CUDA 12.3.0 https://developer.nvidia.com/cuda-zone
-* cuDNN 8.9.5 https://developer.nvidia.com/cudnn
-* NCCL 2.18.5 https://developer.nvidia.com/nccl
-* nvCOMP 3.0.4 https://developer.nvidia.com/nvcomp
+* CUDA 12.3.1 https://developer.nvidia.com/cuda-zone
+* cuDNN 8.9.7 https://developer.nvidia.com/cudnn
+* NCCL 2.19.3 https://developer.nvidia.com/nccl
+* nvCOMP 3.0.5 https://developer.nvidia.com/nvcomp

Please refer to the parent README.md file for more detailed information about the JavaCPP Presets.

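Since the presets now assume these exact releases, a quick sanity check on the build machine can catch a mismatched toolchain before a long build fails. A hedged sketch, assuming the default Linux layout under `/usr/local/cuda` (header locations vary with the install method):

```bash
# Report the CUDA, cuDNN, and NCCL versions visible to the build.
nvcc --version | grep release                        # toolkit release, e.g. 12.3
grep -m1 '"version"' /usr/local/cuda/version.json    # full toolkit version string
grep -A2 'define CUDNN_MAJOR' /usr/local/cuda/include/cudnn_version.h
grep -A2 'define NCCL_MAJOR'  /usr/local/cuda/include/nccl.h
```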
6 changes: 3 additions & 3 deletions cuda/src/gen/java/org/bytedeco/cuda/global/cublas.java
@@ -103,8 +103,8 @@ public class cublas extends org.bytedeco.cuda.presets.cublas {

public static final int CUBLAS_VER_MAJOR = 12;
public static final int CUBLAS_VER_MINOR = 3;
-public static final int CUBLAS_VER_PATCH = 2;
-public static final int CUBLAS_VER_BUILD = 9;
+public static final int CUBLAS_VER_PATCH = 4;
+public static final int CUBLAS_VER_BUILD = 1;
public static final int CUBLAS_VERSION = (CUBLAS_VER_MAJOR * 10000 + CUBLAS_VER_MINOR * 100 + CUBLAS_VER_PATCH);

/* CUBLAS status type returns */
@@ -14009,7 +14009,7 @@ public static native void cublasZtrmm(@Cast("char") byte side,
// #include "driver_types.h"
// #include "cuComplex.h" /* import complex data type */

-// #include "cublas_api.h"
+// #include "cublas_v2.h"

// #if defined(__cplusplus)
// Targeting ../cublas/cublasXtContext.java
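The constant bump reflects the cuBLAS shipped with CUDA 12.3.1: version 12.3.4 build 1, up from 12.3.2 build 9. `CUBLAS_VERSION` packs major, minor, and patch into a single integer, so version guards comparing against it keep working across the upgrade:

```bash
# CUBLAS_VERSION = MAJOR * 10000 + MINOR * 100 + PATCH
echo $((12 * 10000 + 3 * 100 + 2))   # 120302 (cuBLAS 12.3.2, CUDA 12.3.0)
echo $((12 * 10000 + 3 * 100 + 4))   # 120304 (cuBLAS 12.3.4, CUDA 12.3.1)
```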
23 changes: 9 additions & 14 deletions cuda/src/gen/java/org/bytedeco/cuda/global/cudart.java
@@ -1496,8 +1496,9 @@ public class cudart extends org.bytedeco.cuda.presets.cudart {
/**
* This option hints to the JIT compiler the minimum number of CTAs from the
* kernel’s grid to be mapped to a SM. Optimizations based on this option
-* need either ::CU_JIT_MAX_REGISTERS or ::CU_JIT_THREADS_PER_BLOCK to be
-* specified as well.
+* need the maximum number of threads per block to be specified as well. This
+* option is ignored when used together with ::CU_JIT_MAX_REGISTERS or
+* ::CU_JIT_THREADS_PER_BLOCK.
* Option type: unsigned int\n
* Applies to: compiler only
*/
@@ -1995,7 +1996,7 @@ interop event. The event must disable timing (i.e.
B has a launch completion dependency on a kernel A, B may wait
until A is complete. Alternatively, blocks of B may begin before
all blocks of A have begun, for example if B can claim execution
-resources unavaiable to A (e.g. they run on different GPUs) or
+resources unavailable to A (e.g. they run on different GPUs) or
if B is a higher priority than A.
Exercise caution if such an ordering inversion could lead
to deadlock.
@@ -25512,11 +25513,8 @@ inside of a graph. The graph(s) contained within the body of the conditional nod
Only one instantiation of the graph may exist at any point in time.
The graph cannot be cloned.

-To set the control value:
-
-In a kernel or kernels at appropriate locations in the graph, insert a call to
-{@code void cudaGraphSetConditional(cudaGraphConditionalHandle handle, unsigned int value)}.
-Supply a default value when creating the handle. */
+To set the control value, supply a default value when creating the handle and/or
+call ::cudaGraphSetConditional from device code. */
cudaGraphNodeTypeConditional = 0x0d,
cudaGraphNodeTypeCount = 0x0d + 1;
// Targeting ../cudart/cudaChildGraphNodeParams.java
@@ -25790,12 +25788,9 @@ interop event. The event must disable timing (i.e.
have begun execution. Currently this is a best effort. If a kernel
B has a launch completion dependency on a kernel A, B may wait
until A is complete. Alternatively, blocks of B may begin before
-all blocks of A have begun, for example:
-<ul>
-<li>If B can claim execution resources unavaiable to A, for
-example if they run on different GPUs.
-<li>If B is a higher priority than A.
-</ul>
+all blocks of A have begun, for example if B can claim execution
+resources unavailable to A (e.g. they run on different GPUs) or
+if B is a higher priority than A.
Exercise caution if such an ordering inversion could lead
to deadlock.
<br>
(Diffs for the remaining 59 changed files not shown.)