bytedeco · saudet · Nov 15, 2023 · Nov 1, 2023 · Nov 2, 2023 · Nov 2, 2023
diff --git a/.github/actions/deploy-centos/action.yml b/.github/actions/deploy-centos/action.yml
@@ -71,7 +71,7 @@ runs:
         fi
 
         if [[ "$CI_DEPLOY_PLATFORM" == "linux-x86_64" ]] && [[ -n ${CI_DEPLOY_NEED_CUDA:-} ]]; then
-          echo Installing CUDA, cuDNN, etc
+          echo Installing CUDA, cuDNN, nvCOMP, etc
           curl -LO https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda-repo-rhel7-12-1-local-12.1.1_530.30.02-1.x86_64.rpm
           curl -LO https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/libcudnn8-8.9.1.23-1.cuda12.1.x86_64.rpm
           curl -LO https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/libcudnn8-devel-8.9.1.23-1.cuda12.1.x86_64.rpm
@@ -86,6 +86,11 @@ runs:
           mv /usr/include/cudnn* /usr/include/nccl* /usr/local/cuda/include/
           mv /usr/lib64/libcudnn* /usr/lib64/libnccl* /usr/local/cuda/lib64/
 
+          curl -LO https://developer.download.nvidia.com/compute/nvcomp/3.0.4/local_installers/nvcomp_3.0.4_x86_64_12.x.tgz
+          tar -xvf nvcomp_3.0.4_x86_64_12.x.tgz -C /usr/local/cuda/lib64/ --strip-components=1 lib/
+          tar -xvf nvcomp_3.0.4_x86_64_12.x.tgz -C /usr/local/cuda/include/ --strip-components=1 include/
+          rm -f nvcomp_3.0.4_x86_64_12.x.tgz
+
           # Work around issues with CUDA 10.2/11.x
           mv /usr/include/cublas* /usr/include/nvblas* /usr/local/cuda/include/ || true
           mv /usr/lib64/libcublas* /usr/lib64/libnvblas* /usr/local/cuda/lib64/ || true
@@ -112,7 +117,7 @@ runs:
           sed -i /warp_merge_sort.cuh/d /usr/local/cuda/include/cub/cub.cuh
 
           # Remove downloaded archives and unused libraries to avoid running out of disk space
-          rm -f $(find /usr/local/cuda/ -name '*.a' -and -not -name libcudart_static.a -and -not -name libcudadevrt.a)
+          rm -f $(find /usr/local/cuda/ -name '*.a' -and -not -name libcudart_static.a -and -not -name libcudadevrt.a -and -not -name libnvcomp_device.a)
         fi
 
         if [[ "$CI_DEPLOY_MODULE" == "nvcodec" ]]; then

diff --git a/.github/actions/deploy-ubuntu/action.yml b/.github/actions/deploy-ubuntu/action.yml
@@ -34,6 +34,7 @@ runs:
           export CUDA=cuda-repo-rhel8-12-3-local-12.3.0_545.23.06-1.aarch64.rpm
           export CUDNN=8.9.5.29-1.cuda12.2.aarch64
           export NCCL=2.18.5-1+cuda12.2.aarch64
+          export NVCOMP=nvcomp_3.0.4_SBSA_12.x
           export USERLAND_BUILDME="buildme --aarch64"
         elif [[ "$CI_DEPLOY_PLATFORM" == "linux-ppc64le" ]]; then
           export ARCH=ppc64el
@@ -52,6 +53,7 @@ runs:
           export CUDA=cuda-repo-rhel8-12-3-local-12.3.0_545.23.06-1.x86_64.rpm
           export CUDNN=8.9.5.29-1.cuda12.2.x86_64
           export NCCL=2.18.5-1+cuda12.2.x86_64
+          export NVCOMP=nvcomp_3.0.4_x86_64_12.x
         fi
         echo "ARCH=$ARCH" >> $GITHUB_ENV
         echo "PREFIX=$PREFIX" >> $GITHUB_ENV
@@ -137,7 +139,7 @@ runs:
         fi
 
         if [[ -n ${ARCH_CUDA:-} ]] && [[ -n ${CI_DEPLOY_NEED_CUDA:-} ]]; then
-          echo Installing CUDA, cuDNN, etc
+          echo Installing CUDA, cuDNN, nvCOMP, etc
           curl -LO https://developer.download.nvidia.com/compute/cuda/12.3.0/local_installers/$CUDA
           curl -LO https://developer.download.nvidia.com/compute/cuda/repos/rhel8/$ARCH_CUDA/libcudnn8-$CUDNN.rpm
           curl -LO https://developer.download.nvidia.com/compute/cuda/repos/rhel8/$ARCH_CUDA/libcudnn8-devel-$CUDNN.rpm
@@ -153,6 +155,13 @@ runs:
           $SUDO mv /usr/include/cudnn* /usr/include/nccl* /usr/local/cuda/include/
           $SUDO mv /usr/lib64/libcudnn* /usr/lib64/libnccl* /usr/local/cuda/lib64/
 
+          if [[ -n ${NVCOMP:-} ]]; then  # skip when no nvCOMP archive name was exported for this platform
+            curl -LO https://developer.download.nvidia.com/compute/nvcomp/3.0.4/local_installers/$NVCOMP.tgz
+            $SUDO tar -xvf $NVCOMP.tgz -C /usr/local/cuda/lib64/ --strip-components=1 lib/  # extract exactly the archive downloaded above, not a glob
+            $SUDO tar -xvf $NVCOMP.tgz -C /usr/local/cuda/include/ --strip-components=1 include/
+            rm -f $NVCOMP.tgz
+          fi
+
           # Work around issues with CUDA 10.2/11.x
           $SUDO mv /usr/include/cublas* /usr/include/nvblas* /usr/local/cuda/include/ || true
           $SUDO mv /usr/lib64/libcublas* /usr/lib64/libnvblas* /usr/local/cuda/lib64/ || true
@@ -179,7 +188,7 @@ runs:
           $SUDO sed -i /warp_merge_sort.cuh/d /usr/local/cuda/include/cub/cub.cuh
 
           # Remove downloaded archives and unused libraries to avoid running out of disk space
-          $SUDO rm -f $(find /usr/local/cuda/ -name '*.a' -and -not -name libcudart_static.a -and -not -name libcudadevrt.a)
+          $SUDO rm -f $(find /usr/local/cuda/ -name '*.a' -and -not -name libcudart_static.a -and -not -name libcudadevrt.a -and -not -name libnvcomp_device.a)
         fi
 
         if [[ "$CI_DEPLOY_MODULE" == "nvcodec" ]]; then

diff --git a/.github/actions/deploy-windows/action.yml b/.github/actions/deploy-windows/action.yml
@@ -90,12 +90,13 @@ runs:
         )
 
         if "%CI_DEPLOY_PLATFORM%"=="windows-x86_64" if not "%CI_DEPLOY_NEED_CUDA%"=="" (
-          echo Installing CUDA, cuDNN, etc
+          echo Installing CUDA, cuDNN, nvCOMP, etc
           curl -LO https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_522.06_windows.exe
           curl -LO https://developer.download.nvidia.com/compute/cuda/12.3.0/local_installers/cuda_12.3.0_545.84_windows.exe
           rem curl -LO https://developer.download.nvidia.com/compute/redist/cudnn/v8.8.0/local_installers/12.0/cudnn_8.8.0.121_windows.exe
           python -m gdown.cli https://drive.google.com/uc?id=1-5QHvwDZC_1rhn5W6fRHNWicXRPtqt31
           curl -LO http://www.winimage.com/zLibDll/zlib123dllx64.zip
+          curl -LO https://developer.download.nvidia.com/compute/nvcomp/3.0.4/local_installers/nvcomp_3.0.4_windows_12.x.zip
           cuda_11.8.0_522.06_windows.exe -s
           bash -c "rm -Rf 'C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8'"
           bash -c "mv 'C:/Program Files/NVIDIA Corporation/NvToolsExt' 'C:/Program Files/NVIDIA Corporation/NvToolsExt_old'"
@@ -105,13 +106,20 @@ runs:
           rem cudnn_8.8.0.121_windows.exe -s
           unzip cudnn-windows-x86_64-8.9.5.29_cuda12-archive.zip
           unzip zlib123dllx64.zip
+          unzip nvcomp_3.0.4_windows_12.x.zip
           rem move "%ProgramFiles%\NVIDIA\CUDNN\v8.8\bin\*.dll" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\bin"
           rem move "%ProgramFiles%\NVIDIA\CUDNN\v8.8\include\*.h" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\include"
           rem move "%ProgramFiles%\NVIDIA\CUDNN\v8.8\lib\x64\*.lib" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\lib\x64"
           move cudnn-windows-x86_64-8.9.5.29_cuda12-archive\bin\*.dll "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\bin"
           move cudnn-windows-x86_64-8.9.5.29_cuda12-archive\include\*.h "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\include"
           move cudnn-windows-x86_64-8.9.5.29_cuda12-archive\lib\x64\*.lib "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\lib\x64"
           move dll_x64\zlibwapi.dll "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\bin"
+          move include\* "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\include"
+          move include\gdeflate "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\include"
+          move include\native "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\include"
+          move include\nvcomp "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\include"
+          move lib\nvcomp*.dll "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\bin"
+          move lib\nvcomp*.lib "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.3\lib\x64"
 
           rem echo Applying hotfix to Visual Studio 2019 for CUDA
           rem curl -LO https://github.com/raw/microsoft/STL/main/stl/inc/cmath

diff --git a/README.md b/README.md
@@ -219,6 +219,7 @@ Each child module in turn relies by default on the included [`cppbuild.sh` scrip
  * CUDA 12.3.x  https://developer.nvidia.com/cuda-downloads
    * cuDNN 8.9.x  https://developer.nvidia.com/cudnn
    * NCCL 2.18.x  https://developer.nvidia.com/nccl
+   * nvCOMP 3.0.x  https://developer.nvidia.com/nvcomp
  * NVIDIA Video Codec SDK 12.1.x  https://developer.nvidia.com/nvidia-video-codec-sdk
  * OpenCL 3.0.x  https://github.com/KhronosGroup/OpenCL-ICD-Loader
  * MXNet 1.9.x  https://github.com/apache/incubator-mxnet

diff --git a/cuda/README.md b/cuda/README.md
@@ -18,6 +18,8 @@ To view the license for cuDNN included in these archives, click [here](https://d
 ### NVIDIA Collective Communications Library (NCCL)
 To view the license for NCCL included in these archives, click [here](https://github.com/NVIDIA/nccl/blob/master/LICENSE.txt)
 
+### NVIDIA nvCOMP
+To view the license for nvCOMP included in these archives, click [here](https://github.com/NVIDIA/nvcomp/blob/main/LICENSE)
 
 Introduction
 ------------
@@ -26,6 +28,7 @@ This directory contains the JavaCPP Presets module for:
  * CUDA 12.3.0  https://developer.nvidia.com/cuda-zone
  * cuDNN 8.9.5  https://developer.nvidia.com/cudnn
  * NCCL 2.18.5  https://developer.nvidia.com/nccl
+ * nvCOMP 3.0.4  https://developer.nvidia.com/nvcomp
 
 Please refer to the parent README.md file for more detailed information about the JavaCPP Presets.
 

diff --git a/cuda/samples/nvcompLZ4Example.java b/cuda/samples/nvcompLZ4Example.java
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+import org.bytedeco.cuda.cudart.CUstream_st;
+import org.bytedeco.cuda.global.nvcomp;
+import org.bytedeco.cuda.nvcomp.*;
+import org.bytedeco.javacpp.BytePointer;
+import org.bytedeco.javacpp.Loader;
+
+import static org.bytedeco.cuda.global.cudart.*;
+import static org.bytedeco.cuda.global.nvcomp.*;
+
+// https://github.com/NVIDIA/nvcomp/blob/main/examples/high_level_quickstart_example.cpp
+public class nvcompLZ4Example {
+    private static void decomp_compressed_with_manager_factory_example(BytePointer device_input_ptrs, long input_buffer_len) {
+        CUstream_st stream = new CUstream_st();
+        int cuda_error = cudaStreamCreate(stream);
+
+        long chunk_size = 1 << 16;
+
+        nvcompBatchedLZ4Opts_t format_opts = new nvcompBatchedLZ4Opts_t();
+        format_opts.data_type(NVCOMP_TYPE_CHAR);
+        LZ4Manager nvcomp_manager = new LZ4Manager(chunk_size, format_opts, stream, 0, nvcomp.NoComputeNoVerify);
+        CompressionConfig comp_config = nvcomp_manager.configure_compression(input_buffer_len);
+
+        BytePointer comp_buffer = new BytePointer();
+        cuda_error = cudaMalloc(comp_buffer, comp_config.max_compressed_buffer_size());
+
+        nvcomp_manager.compress(device_input_ptrs, comp_buffer, comp_config);
+
+        // Construct a new nvcomp manager from the compressed buffer.
+        // Note we could use the nvcomp_manager from above, but here we demonstrate how to create a manager
+        // for the use case where a buffer is received and the user doesn't know how it was compressed
+        // Also note, creating the manager in this way synchronizes the stream, as the compressed buffer must be read to
+        // construct the manager
+        nvcompManagerBase decomp_nvcomp_manager = create_manager(comp_buffer, stream, 0, NoComputeNoVerify);
+
+        DecompressionConfig decomp_config = decomp_nvcomp_manager.configure_decompression(comp_buffer);
+        BytePointer res_decomp_buffer = new BytePointer();
+        cuda_error = cudaMalloc(res_decomp_buffer, decomp_config.decomp_data_size());
+
+        decomp_nvcomp_manager.decompress(res_decomp_buffer, comp_buffer, decomp_config);
+
+        cuda_error = cudaFree(comp_buffer);
+        cuda_error = cudaFree(res_decomp_buffer);
+        cuda_error = cudaStreamSynchronize(stream);
+        cuda_error = cudaStreamDestroy(stream);
+    }
+
+    public static void main(String[] args) {
+        Loader.load(nvcomp.class);
+
+        // Initialize a random array of chars
+        int input_buffer_len = 1000000;
+        byte[] uncompressed_data = new byte[input_buffer_len];
+
+        for (int i = 0; i < input_buffer_len; i++) {
+            uncompressed_data[i] = (byte) (Math.random() * 26 + 'a');
+        }
+
+        BytePointer uncompressed_data_ptr = new BytePointer(uncompressed_data);
+
+        BytePointer device_input_ptrs = new BytePointer();
+
+        int cuda_error = cudaMalloc(device_input_ptrs, input_buffer_len);
+        cuda_error = cudaMemcpy(device_input_ptrs, uncompressed_data_ptr, input_buffer_len, cudaMemcpyDefault);
+
+        decomp_compressed_with_manager_factory_example(device_input_ptrs, input_buffer_len);
+    }
+}