* Upgrade presets for CUDA 12.3.1, cuDNN 8.9.7, NCCL 2.19.3, nvCOMP 3.0.5, DepthAI 2.24.0
saudet committed Dec 29, 2023
1 parent 6e36cac commit a885e6c
Showing 66 changed files with 1,165 additions and 291 deletions.
30 changes: 15 additions & 15 deletions .github/actions/deploy-ubuntu/action.yml
@@ -31,29 +31,29 @@ runs:
export ARCH=arm64
export ARCH_CUDA=sbsa
export PREFIX=aarch64-linux-gnu
-export CUDA=cuda-repo-rhel8-12-3-local-12.3.0_545.23.06-1.aarch64.rpm
-export CUDNN=8.9.5.29-1.cuda12.2.aarch64
-export NCCL=2.18.5-1+cuda12.2.aarch64
-export NVCOMP=nvcomp_3.0.4_SBSA_12.x
+export CUDA=cuda-repo-rhel8-12-3-local-12.3.1_545.23.08-1.aarch64.rpm
+export CUDNN=8.9.7.29-1.cuda12.2.aarch64
+export NCCL=2.19.3-1+cuda12.3.aarch64
+export NVCOMP=nvcomp_3.0.5_SBSA_12.x
export USERLAND_BUILDME="buildme --aarch64"
elif [[ "$CI_DEPLOY_PLATFORM" == "linux-ppc64le" ]]; then
export ARCH=ppc64el
export ARCH_CUDA=ppc64le
export PREFIX=powerpc64le-linux-gnu
-export CUDA=cuda-repo-rhel8-12-3-local-12.3.0_545.23.06-1.ppc64le.rpm
-export CUDNN=8.9.5.29-1.cuda12.2.ppc64le
-export NCCL=2.18.5-1+cuda12.2.ppc64le
+export CUDA=cuda-repo-rhel8-12-3-local-12.3.1_545.23.08-1.ppc64le.rpm
+export CUDNN=8.9.7.29-1.cuda12.2.ppc64le
+export NCCL=2.19.3-1+cuda12.3.ppc64le
elif [[ "$CI_DEPLOY_PLATFORM" == "linux-x86" ]]; then
export ARCH=i386
export PREFIX=i686-linux-gnu
elif [[ "$CI_DEPLOY_PLATFORM" == "linux-x86_64" ]]; then
export ARCH=amd64
export ARCH_CUDA=x86_64
export PREFIX=x86_64-linux-gnu
-export CUDA=cuda-repo-rhel8-12-3-local-12.3.0_545.23.06-1.x86_64.rpm
-export CUDNN=8.9.5.29-1.cuda12.2.x86_64
-export NCCL=2.18.5-1+cuda12.2.x86_64
-export NVCOMP=nvcomp_3.0.4_x86_64_12.x
+export CUDA=cuda-repo-rhel8-12-3-local-12.3.1_545.23.08-1.x86_64.rpm
+export CUDNN=8.9.7.29-1.cuda12.2.x86_64
+export NCCL=2.19.3-1+cuda12.3.x86_64
+export NVCOMP=nvcomp_3.0.5_x86_64_12.x
fi
echo "ARCH=$ARCH" >> $GITHUB_ENV
echo "PREFIX=$PREFIX" >> $GITHUB_ENV
@@ -140,7 +140,7 @@ runs:
if [[ -n ${ARCH_CUDA:-} ]] && [[ -n ${CI_DEPLOY_NEED_CUDA:-} ]]; then
echo Installing CUDA, cuDNN, nvCOMP, etc
-curl -LO https://developer.download.nvidia.com/compute/cuda/12.3.0/local_installers/$CUDA
+curl -LO https://developer.download.nvidia.com/compute/cuda/12.3.1/local_installers/$CUDA
curl -LO https://developer.download.nvidia.com/compute/cuda/repos/rhel8/$ARCH_CUDA/libcudnn8-$CUDNN.rpm
curl -LO https://developer.download.nvidia.com/compute/cuda/repos/rhel8/$ARCH_CUDA/libcudnn8-devel-$CUDNN.rpm
curl -LO https://developer.download.nvidia.com/compute/cuda/repos/rhel8/$ARCH_CUDA/libnccl-$NCCL.rpm
@@ -156,9 +156,9 @@ runs:
$SUDO mv /usr/lib64/libcudnn* /usr/lib64/libnccl* /usr/local/cuda/lib64/
if [[ -n ${NVCOMP:-} ]]; then
-curl -LO https://developer.download.nvidia.com/compute/nvcomp/3.0.4/local_installers/$NVCOMP.tgz
-$SUDO tar -xvf nvcomp_*.tgz -C /usr/local/cuda/lib64/ --strip-components=1 lib/
-$SUDO tar -xvf nvcomp_*.tgz -C /usr/local/cuda/include/ --strip-components=1 include/
+curl -LO https://developer.download.nvidia.com/compute/nvcomp/3.0.5/local_installers/$NVCOMP.tgz
+$SUDO tar -xvf $NVCOMP.tgz -C /usr/local/cuda/lib64/ --strip-components=1 lib/ || $SUDO tar -xvf $NVCOMP.tgz -C /usr/local/cuda/lib64/ --strip-components=2 nvcomp-3.0.5-ctk-12.2/lib/
+$SUDO tar -xvf $NVCOMP.tgz -C /usr/local/cuda/include/ --strip-components=1 include/ || $SUDO tar -xvf $NVCOMP.tgz -C /usr/local/cuda/include/ --strip-components=2 nvcomp-3.0.5-ctk-12.2/include/
rm -f $NVCOMP.tgz
fi
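The nvCOMP extraction above gained a `||` fallback: the first `tar` succeeds on archives that keep `lib/` and `include/` at the top level, and if that member path is absent, the second attempt assumes the contents are nested under `nvcomp-3.0.5-ctk-12.2/`, as in some of the 3.0.5 tarballs. A minimal standalone sketch of the same probe-then-fall-back pattern (file and directory names are illustrative):

```bash
#!/bin/bash
# Extract an nvCOMP tarball whose internal layout may be flat or nested.
NVCOMP=nvcomp_3.0.5_x86_64_12.x   # matches the x86_64 case above
CUDA_HOME=/usr/local/cuda

# GNU tar exits non-zero when the named member is absent, so the || branch
# only runs for archives using the nested nvcomp-3.0.5-ctk-12.2/ layout.
sudo tar -xvf "$NVCOMP.tgz" -C "$CUDA_HOME/lib64/" --strip-components=1 lib/ ||
  sudo tar -xvf "$NVCOMP.tgz" -C "$CUDA_HOME/lib64/" --strip-components=2 nvcomp-3.0.5-ctk-12.2/lib/
sudo tar -xvf "$NVCOMP.tgz" -C "$CUDA_HOME/include/" --strip-components=1 include/ ||
  sudo tar -xvf "$NVCOMP.tgz" -C "$CUDA_HOME/include/" --strip-components=2 nvcomp-3.0.5-ctk-12.2/include/
rm -f "$NVCOMP.tgz"
```

Switching from the `nvcomp_*.tgz` glob to `$NVCOMP.tgz` also makes the commands deterministic when more than one nvCOMP tarball is present in the working directory.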
20 changes: 10 additions & 10 deletions .github/actions/deploy-windows/action.yml
@@ -94,27 +94,27 @@ runs:
if "%CI_DEPLOY_PLATFORM%"=="windows-x86_64" if not "%CI_DEPLOY_NEED_CUDA%"=="" (
echo Installing CUDA, cuDNN, nvCOMP, etc
curl -LO https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_522.06_windows.exe
-curl -LO https://developer.download.nvidia.com/compute/cuda/12.3.0/local_installers/cuda_12.3.0_545.84_windows.exe
+curl -LO https://developer.download.nvidia.com/compute/cuda/12.3.1/local_installers/cuda_12.3.1_546.12_windows.exe
rem curl -LO https://developer.download.nvidia.com/compute/redist/cudnn/v8.8.0/local_installers/12.0/cudnn_8.8.0.121_windows.exe
-python -m gdown.cli https://drive.google.com/uc?id=1-5QHvwDZC_1rhn5W6fRHNWicXRPtqt31
+python -m gdown.cli https://drive.google.com/uc?id=135Z7zfwguQe6vn7p013HtVkHFu9-_rru
curl -LO http://www.winimage.com/zLibDll/zlib123dllx64.zip
-curl -LO https://developer.download.nvidia.com/compute/nvcomp/3.0.4/local_installers/nvcomp_3.0.4_windows_12.x.zip
+curl -LO https://developer.download.nvidia.com/compute/nvcomp/3.0.5/local_installers/nvcomp_3.0.5_windows_12.x.zip
cuda_11.8.0_522.06_windows.exe -s
bash -c "rm -Rf 'C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8'"
bash -c "mv 'C:/Program Files/NVIDIA Corporation/NvToolsExt' 'C:/Program Files/NVIDIA Corporation/NvToolsExt_old'"
-cuda_12.3.0_545.84_windows.exe -s
+cuda_12.3.1_546.12_windows.exe -s
bash -c "mv 'C:/Program Files/NVIDIA Corporation/NvToolsExt_old' 'C:/Program Files/NVIDIA Corporation/NvToolsExt'"
bash -c "ls 'C:/Program Files/NVIDIA Corporation/NvToolsExt'"
rem cudnn_8.8.0.121_windows.exe -s
-unzip cudnn-windows-x86_64-8.9.5.29_cuda12-archive.zip
+unzip cudnn-windows-x86_64-8.9.7.29_cuda12-archive.zip
unzip zlib123dllx64.zip
-unzip nvcomp_3.0.4_windows_12.x.zip
+unzip nvcomp_3.0.5_windows_12.x.zip
rem move "%ProgramFiles%\NVIDIA\CUDNN\v8.8\bin\*.dll" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\bin"
rem move "%ProgramFiles%\NVIDIA\CUDNN\v8.8\include\*.h" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\include"
rem move "%ProgramFiles%\NVIDIA\CUDNN\v8.8\lib\x64\*.lib" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\lib\x64"
-move cudnn-windows-x86_64-8.9.5.29_cuda12-archive\bin\*.dll "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\bin"
-move cudnn-windows-x86_64-8.9.5.29_cuda12-archive\include\*.h "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\include"
-move cudnn-windows-x86_64-8.9.5.29_cuda12-archive\lib\x64\*.lib "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\lib\x64"
+move cudnn-windows-x86_64-8.9.7.29_cuda12-archive\bin\*.dll "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\bin"
+move cudnn-windows-x86_64-8.9.7.29_cuda12-archive\include\*.h "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\include"
+move cudnn-windows-x86_64-8.9.7.29_cuda12-archive\lib\x64\*.lib "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\lib\x64"
move dll_x64\zlibwapi.dll "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\bin"
move include\* "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\include"
move include\gdeflate "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\include"
@@ -229,7 +229,7 @@ runs:
set "CUDA_PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3"
set "CUDA_PATH_V12_3=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3"
set "PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\libnvvp;%PATH%"
-echo CUDA Version 12.3.0>"%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\version.txt"
+echo CUDA Version 12.3.1>"%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\version.txt"
)
set "CCACHE_DIR=%USERPROFILE%\ccache"
set "PATH=C:\hostedtoolcache\windows\Python\3.9.13\x64;C:\msys64\%MSYSTEM%\bin;C:\msys64\usr\bin;%ProgramFiles%\apache-maven-3.6.3\bin;%PATH%"
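Note the dance around the 12.3.1 installer: the `NvToolsExt` tree left behind by CUDA 11.8 is parked under a temporary name before `cuda_12.3.1_546.12_windows.exe -s` runs, then restored, presumably because the 12.x installers no longer ship (and may remove) that standalone NVTX directory while downstream builds still look for it. A sketch of the pattern as it would look in MSYS bash (invocation details are illustrative):

```bash
# Park the NvToolsExt tree installed by CUDA 11.8 so the CUDA 12.3.1
# installer cannot clobber it, then restore it after the silent install.
NVTOOLS="C:/Program Files/NVIDIA Corporation/NvToolsExt"
mv "$NVTOOLS" "${NVTOOLS}_old"
cmd //c cuda_12.3.1_546.12_windows.exe -s   # silent CUDA 12.3.1 install
mv "${NVTOOLS}_old" "$NVTOOLS"
ls "$NVTOOLS"                               # sanity check: the tree survived
```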
4 changes: 2 additions & 2 deletions CHANGELOG.md
@@ -1,11 +1,11 @@

* Include `timeapi.h` for system API of Windows ([pull #1447](https://github.com/bytedeco/javacpp-presets/pull/1447))
* Add Android and Windows builds to presets for DepthAI ([pull #1441](https://github.com/bytedeco/javacpp-presets/pull/1441))
-* Add presets for nvCOMP 3.0.4 ([pull #1434](https://github.com/bytedeco/javacpp-presets/pull/1434)), SentencePiece 0.1.99 ([pull #1384](https://github.com/bytedeco/javacpp-presets/pull/1384))
+* Add presets for nvCOMP 3.0.5 ([pull #1434](https://github.com/bytedeco/javacpp-presets/pull/1434)), SentencePiece 0.1.99 ([pull #1384](https://github.com/bytedeco/javacpp-presets/pull/1384))
* Refactor and improve presets for PyTorch ([pull #1360](https://github.com/bytedeco/javacpp-presets/pull/1360))
* Include `mkl_lapack.h` header file in presets for MKL ([issue #1388](https://github.com/bytedeco/javacpp-presets/issues/1388))
* Map new higher-level C++ API of Triton Inference Server ([pull #1361](https://github.com/bytedeco/javacpp-presets/pull/1361))
-* Upgrade presets for OpenCV 4.8.1, FFmpeg 6.1, HDF5 1.14.3, MKL 2024.0, DNNL 3.3.3, OpenBLAS 0.3.25, ARPACK-NG 3.9.1, CPython 3.12.1, NumPy 1.26.2, SciPy 1.11.4, LLVM 17.0.6, Leptonica 1.83.1, Tesseract 5.3.3, CUDA 12.3.0, cuDNN 8.9.5, NCCL 2.18.5, PyTorch 2.1.2 ([pull #1426](https://github.com/bytedeco/javacpp-presets/pull/1426)), TensorFlow Lite 2.15.0, Triton Inference Server 2.38.0, DepthAI 2.23.0, ONNX 1.15.0, ONNX Runtime 1.16.3, TVM 0.14.0, and their dependencies
+* Upgrade presets for OpenCV 4.8.1, FFmpeg 6.1, HDF5 1.14.3, MKL 2024.0, DNNL 3.3.3, OpenBLAS 0.3.25, ARPACK-NG 3.9.1, CPython 3.12.1, NumPy 1.26.2, SciPy 1.11.4, LLVM 17.0.6, Leptonica 1.83.1, Tesseract 5.3.3, CUDA 12.3.1, cuDNN 8.9.7, NCCL 2.19.3, PyTorch 2.1.2 ([pull #1426](https://github.com/bytedeco/javacpp-presets/pull/1426)), TensorFlow Lite 2.15.0, Triton Inference Server 2.38.0, DepthAI 2.24.0, ONNX 1.15.0, ONNX Runtime 1.16.3, TVM 0.14.0, and their dependencies

### June 6, 2023 version 1.5.9
* Virtualize `nvinfer1::IGpuAllocator` from TensorRT to allow customization ([pull #1367](https://github.com/bytedeco/javacpp-presets/pull/1367))
2 changes: 1 addition & 1 deletion README.md
@@ -230,7 +230,7 @@ Each child module in turn relies by default on the included [`cppbuild.sh` scrip
* TensorRT 8.6.x https://developer.nvidia.com/tensorrt
* Triton Inference Server 2.38.x https://developer.nvidia.com/nvidia-triton-inference-server
* The Arcade Learning Environment 0.8.x https://github.com/mgbellemare/Arcade-Learning-Environment
-* DepthAI 2.23.x https://github.com/luxonis/depthai-core
+* DepthAI 2.24.x https://github.com/luxonis/depthai-core
* ONNX 1.15.x https://github.com/onnx/onnx
* nGraph 0.26.0 https://github.com/NervanaSystems/ngraph
* ONNX Runtime 1.16.x https://github.com/microsoft/onnxruntime
8 changes: 4 additions & 4 deletions cuda/README.md
@@ -25,10 +25,10 @@ Introduction
------------
This directory contains the JavaCPP Presets module for:

-* CUDA 12.3.0 https://developer.nvidia.com/cuda-zone
-* cuDNN 8.9.5 https://developer.nvidia.com/cudnn
-* NCCL 2.18.5 https://developer.nvidia.com/nccl
-* nvCOMP 3.0.4 https://developer.nvidia.com/nvcomp
+* CUDA 12.3.1 https://developer.nvidia.com/cuda-zone
+* cuDNN 8.9.7 https://developer.nvidia.com/cudnn
+* NCCL 2.19.3 https://developer.nvidia.com/nccl
+* nvCOMP 3.0.5 https://developer.nvidia.com/nvcomp

Please refer to the parent README.md file for more detailed information about the JavaCPP Presets.

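Since the presets now assume these exact releases, a quick sanity check on the build machine can catch a mismatched toolchain before a long build fails. A hedged sketch, assuming the default Linux layout under `/usr/local/cuda` (header locations vary with the install method):

```bash
# Report the CUDA, cuDNN, and NCCL versions visible to the build.
nvcc --version | grep release                        # toolkit release, e.g. 12.3
grep -m1 '"version"' /usr/local/cuda/version.json    # full toolkit version string
grep -A2 'define CUDNN_MAJOR' /usr/local/cuda/include/cudnn_version.h
grep -A2 'define NCCL_MAJOR'  /usr/local/cuda/include/nccl.h
```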
6 changes: 3 additions & 3 deletions cuda/src/gen/java/org/bytedeco/cuda/global/cublas.java
@@ -103,8 +103,8 @@ public class cublas extends org.bytedeco.cuda.presets.cublas {

public static final int CUBLAS_VER_MAJOR = 12;
public static final int CUBLAS_VER_MINOR = 3;
-public static final int CUBLAS_VER_PATCH = 2;
-public static final int CUBLAS_VER_BUILD = 9;
+public static final int CUBLAS_VER_PATCH = 4;
+public static final int CUBLAS_VER_BUILD = 1;
public static final int CUBLAS_VERSION = (CUBLAS_VER_MAJOR * 10000 + CUBLAS_VER_MINOR * 100 + CUBLAS_VER_PATCH);

/* CUBLAS status type returns */
@@ -14009,7 +14009,7 @@ public static native void cublasZtrmm(@Cast("char") byte side,
// #include "driver_types.h"
// #include "cuComplex.h" /* import complex data type */

-// #include "cublas_api.h"
+// #include "cublas_v2.h"

// #if defined(__cplusplus)
// Targeting ../cublas/cublasXtContext.java
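The constant bump reflects the cuBLAS shipped with CUDA 12.3.1: version 12.3.4 build 1, up from 12.3.2 build 9. `CUBLAS_VERSION` packs major, minor, and patch into a single integer, so version guards comparing against it keep working across the upgrade:

```bash
# CUBLAS_VERSION = MAJOR * 10000 + MINOR * 100 + PATCH
echo $((12 * 10000 + 3 * 100 + 2))   # 120302 (cuBLAS 12.3.2, CUDA 12.3.0)
echo $((12 * 10000 + 3 * 100 + 4))   # 120304 (cuBLAS 12.3.4, CUDA 12.3.1)
```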
23 changes: 9 additions & 14 deletions cuda/src/gen/java/org/bytedeco/cuda/global/cudart.java
@@ -1496,8 +1496,9 @@ public class cudart extends org.bytedeco.cuda.presets.cudart {
/**
* This option hints to the JIT compiler the minimum number of CTAs from the
* kernel’s grid to be mapped to a SM. Optimizations based on this option
-* need either ::CU_JIT_MAX_REGISTERS or ::CU_JIT_THREADS_PER_BLOCK to be
-* specified as well.
+* need the maximum number of threads per block to be specified as well. This
+* option is ignored when used together with ::CU_JIT_MAX_REGISTERS or
+* ::CU_JIT_THREADS_PER_BLOCK.
* Option type: unsigned int\n
* Applies to: compiler only
*/
@@ -1995,7 +1996,7 @@ interop event. The event must disable timing (i.e.
B has a launch completion dependency on a kernel A, B may wait
until A is complete. Alternatively, blocks of B may begin before
all blocks of A have begun, for example if B can claim execution
-resources unavaiable to A (e.g. they run on different GPUs) or
+resources unavailable to A (e.g. they run on different GPUs) or
if B is a higher priority than A.
Exercise caution if such an ordering inversion could lead
to deadlock.
@@ -25512,11 +25513,8 @@ inside of a graph. The graph(s) contained within the body of the conditional nod
Only one instantiation of the graph may exist at any point in time.
The graph cannot be cloned.

-To set the control value:
-
-In a kernel or kernels at appropriate locations in the graph, insert a call to
-{@code void cudaGraphSetConditional(cudaGraphConditionalHandle handle, unsigned int value)}.
-Supply a default value when creating the handle. */
+To set the control value, supply a default value when creating the handle and/or
+call ::cudaGraphSetConditional from device code. */
cudaGraphNodeTypeConditional = 0x0d,
cudaGraphNodeTypeCount = 0x0d + 1;
// Targeting ../cudart/cudaChildGraphNodeParams.java
@@ -25790,12 +25788,9 @@ interop event. The event must disable timing (i.e.
have begun execution. Currently this is a best effort. If a kernel
B has a launch completion dependency on a kernel A, B may wait
until A is complete. Alternatively, blocks of B may begin before
-all blocks of A have begun, for example:
-<ul>
-<li>If B can claim execution resources unavaiable to A, for
-example if they run on different GPUs.
-<li>If B is a higher priority than A.
-</ul>
+all blocks of A have begun, for example if B can claim execution
+resources unavailable to A (e.g. they run on different GPUs) or
+if B is a higher priority than A.
Exercise caution if such an ordering inversion could lead
to deadlock.
<br>
(Diffs for the remaining 59 changed files not shown.)