From e28e2a7e193e40b6f785d821b225ab7dbe61e83c Mon Sep 17 00:00:00 2001 From: Christoph Schranz Date: Tue, 19 Dec 2023 16:22:14 +0100 Subject: [PATCH] update libcudnn8 to install TF RTX requirements --- .build/Dockerfile | 7 ++ extra/Getting_Started/GPU-processing.ipynb | 138 ++++++++++++--------- src/Dockerfile.gpulibs | 7 ++ 3 files changed, 92 insertions(+), 60 deletions(-) diff --git a/.build/Dockerfile b/.build/Dockerfile index 763b013..1d45e7b 100755 --- a/.build/Dockerfile +++ b/.build/Dockerfile @@ -380,6 +380,13 @@ LABEL maintainer="Christoph Schranz , Mat # Install Tensorflow, check compatibility here: # https://www.tensorflow.org/install/source#gpu # installation via conda leads to errors in version 4.8.2 +# Install CUDA-specific nvidia libraries and update libcudnn8 before that +USER root +RUN apt-get update && \ + apt-get install -y --no-install-recommends --allow-change-held-packages libcudnn8 && \ + apt-get install -y --no-install-recommends libnvinfer-dev libnvinfer-plugin-dev && \ + apt-get clean && rm -rf /var/lib/apt/lists/* +RUN cd /usr/lib/x86_64-linux-gnu && ln -s libnvinfer_plugin.so.8 libnvinfer_plugin.so.7 && ln -s libnvinfer.so.8 libnvinfer.so.7 USER ${NB_UID} RUN pip install --upgrade pip && \ pip install --no-cache-dir tensorflow==2.15.0 keras==2.15.0 && \ diff --git a/extra/Getting_Started/GPU-processing.ipynb b/extra/Getting_Started/GPU-processing.ipynb index 3aeaa51..fb97168 100755 --- a/extra/Getting_Started/GPU-processing.ipynb +++ b/extra/Getting_Started/GPU-processing.ipynb @@ -27,20 +27,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "Thu Dec 14 17:16:30 2023 \n", + "Tue Dec 19 15:20:27 2023 \n", "+-----------------------------------------------------------------------------+\n", - "| NVIDIA-SMI 520.61.05 Driver Version: 520.61.05 CUDA Version: 11.8 |\n", + "| NVIDIA-SMI 525.54 Driver Version: 526.56 CUDA Version: 12.0 |\n", "|-------------------------------+----------------------+----------------------+\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", "| | | MIG M. |\n", "|===============================+======================+======================|\n", - "| 0 NVIDIA RTX A6000 On | 00000000:41:00.0 Off | Off |\n", - "| 30% 49C P8 27W / 300W | 5MiB / 49140MiB | 0% Default |\n", - "| | | N/A |\n", - "+-------------------------------+----------------------+----------------------+\n", - "| 1 NVIDIA RTX A6000 On | 00000000:61:00.0 Off | Off |\n", - "| 35% 63C P2 90W / 300W | 9635MiB / 49140MiB | 0% Default |\n", + "| 0 NVIDIA GeForce ... On | 00000000:01:00.0 Off | N/A |\n", + "| N/A 43C P0 19W / N/A | 0MiB / 6144MiB | 0% Default |\n", "| | | N/A |\n", "+-------------------------------+----------------------+----------------------+\n", " \n", @@ -49,6 +45,7 @@ "| GPU GI CI PID Type Process name GPU Memory |\n", "| ID ID Usage |\n", "|=============================================================================|\n", + "| No running processes found |\n", "+-----------------------------------------------------------------------------+\n" ] } @@ -94,14 +91,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-12-14 17:16:32.448916: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.\n", - "2023-12-14 17:16:32.472734: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", - "2023-12-14 17:16:32.472758: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", - "2023-12-14 17:16:32.473445: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", - "2023-12-14 17:16:32.477355: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.\n", - "2023-12-14 17:16:32.477730: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", - "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2023-12-14 17:16:33.173733: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" + "2023-12-19 15:20:31.292167: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", + "2023-12-19 15:20:31.292283: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", + "2023-12-19 15:20:31.345027: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", + "2023-12-19 15:20:31.405681: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" ] }, { @@ -115,14 +109,26 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-12-14 17:16:33.915841: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", - "2023-12-14 17:16:33.916057: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", - "2023-12-14 17:16:33.916926: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.\n", - "Skipping registering GPU devices...\n", - "2023-12-14 17:16:34.083985: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", - "2023-12-14 17:16:34.084152: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", - "2023-12-14 17:16:34.084263: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.\n", - "Skipping registering GPU devices...\n" + "2023-12-19 15:20:33.437438: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node\n", + "Your kernel may have been built without NUMA support.\n", + "2023-12-19 15:20:33.438375: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node\n", + "Your kernel may have been built without NUMA support.\n", + "2023-12-19 15:20:33.438401: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node\n", + "Your kernel may have been built without NUMA support.\n", + "2023-12-19 15:20:33.439883: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node\n", + "Your kernel may have been built without NUMA support.\n", + "2023-12-19 15:20:33.439914: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node\n", + "Your kernel may have been built without NUMA support.\n", + "2023-12-19 15:20:33.439931: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node\n", + "Your kernel may have been built without NUMA support.\n", + "2023-12-19 15:20:35.742285: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node\n", + "Your kernel may have been built without NUMA support.\n", + "2023-12-19 15:20:35.742336: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node\n", + "Your kernel may have been built without NUMA support.\n", + "2023-12-19 15:20:35.742346: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2022] Could not identify NUMA node of platform GPU id 0, defaulting to 0. Your kernel may not have been built with NUMA support.\n", + "2023-12-19 15:20:35.742371: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node\n", + "Your kernel may have been built without NUMA support.\n", + "2023-12-19 15:20:35.742387: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /device:GPU:0 with 3007 MB memory: -> device: 0, name: NVIDIA GeForce RTX 3060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6\n" ] }, { @@ -133,8 +139,19 @@ " memory_limit: 268435456\n", " locality {\n", " }\n", - " incarnation: 14747982026689315297\n", - " xla_global_id: -1]" + " incarnation: 16132906446837489126\n", + " xla_global_id: -1,\n", + " name: \"/device:GPU:0\"\n", + " device_type: \"GPU\"\n", + " memory_limit: 3153068032\n", + " locality {\n", + " bus_id: 1\n", + " links {\n", + " }\n", + " }\n", + " incarnation: 13564847057713934038\n", + " physical_device_desc: \"device: 0, name: NVIDIA GeForce RTX 3060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6\"\n", + " xla_global_id: 416903419]" ] }, "execution_count": 3, @@ -157,11 +174,11 @@ { "data": { "text/plain": [ - "tensor([[0.3446, 0.0452, 0.2264],\n", - " [0.7986, 0.7481, 0.9437],\n", - " [0.0514, 0.0179, 0.9945],\n", - " [0.6514, 0.9786, 0.4902],\n", - " [0.9525, 0.8661, 0.2606]])" + "tensor([[0.6527, 0.7924, 0.8878],\n", + " [0.7822, 0.0338, 0.4025],\n", + " [0.6130, 0.1177, 0.9255],\n", + " [0.0451, 0.7010, 0.0331],\n", + " [0.1705, 0.8165, 0.7323]])" ] }, "execution_count": 4, @@ -211,7 +228,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "260 ms ± 61.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "559 ms ± 87.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], @@ -245,7 +262,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "76.7 ms ± 1.28 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "200 ms ± 4.12 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], @@ -271,16 +288,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "tensor([[0.3524, 0.4564, 0.5821, 0.0973, 0.7754],\n", - " [0.7047, 0.2262, 0.4790, 0.1555, 0.5360],\n", - " [0.0142, 0.1699, 0.9471, 0.2035, 0.9215],\n", - " [0.5230, 0.0497, 0.8534, 0.3936, 0.3059],\n", - " [0.8031, 0.8541, 0.3866, 0.6828, 0.7291]], device='cuda:0')\n", - "tensor([[0.3524, 0.4564, 0.5821, 0.0973, 0.7754],\n", - " [0.7047, 0.2262, 0.4790, 0.1555, 0.5360],\n", - " [0.0142, 0.1699, 0.9471, 0.2035, 0.9215],\n", - " [0.5230, 0.0497, 0.8534, 0.3936, 0.3059],\n", - " [0.8031, 0.8541, 0.3866, 0.6828, 0.7291]], dtype=torch.float64)\n" + "tensor([[0.4609, 0.7584, 0.4593, 0.0551, 0.1594],\n", + " [0.6063, 0.5960, 0.4197, 0.7962, 0.1542],\n", + " [0.5160, 0.4067, 0.6062, 0.1356, 0.8867],\n", + " [0.3636, 0.7090, 0.3487, 0.0552, 0.4904],\n", + " [0.6309, 0.0065, 0.8926, 0.0643, 0.6346]], device='cuda:0')\n", + "tensor([[0.4609, 0.7584, 0.4593, 0.0551, 0.1594],\n", + " [0.6063, 0.5960, 0.4197, 0.7962, 0.1542],\n", + " [0.5160, 0.4067, 0.6062, 0.1356, 0.8867],\n", + " [0.3636, 0.7090, 0.3487, 0.0552, 0.4904],\n", + " [0.6309, 0.0065, 0.8926, 0.0643, 0.6346]], dtype=torch.float64)\n" ] } ], @@ -304,7 +321,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "3.37 ms ± 23.6 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "The slowest run took 5.26 times longer than the fastest. This could mean that an intermediate result is being cached.\n", + "90.5 ms ± 74.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], @@ -351,11 +369,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "tensor([[0.4466, 0.0260, 0.0687, 0.6375, 0.9676],\n", - " [0.2974, 0.0200, 0.0621, 0.4341, 0.0167],\n", - " [0.1146, 0.3012, 0.9246, 0.1484, 0.8045],\n", - " [0.4448, 0.5577, 0.4649, 0.2364, 0.7051],\n", - " [0.0479, 0.7472, 0.2121, 0.9418, 0.7699]], device='cuda:0')\n" + "tensor([[0.2782, 0.4737, 0.6745, 0.9081, 0.3480],\n", + " [0.1753, 0.6475, 0.3926, 0.3947, 0.9197],\n", + " [0.1747, 0.6550, 0.4903, 0.4221, 0.3066],\n", + " [0.8606, 0.6053, 0.0784, 0.1127, 0.2536],\n", + " [0.0236, 0.5264, 0.6400, 0.5198, 0.5281]], device='cuda:0')\n" ] } ], @@ -386,11 +404,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "tensor([[ 1.2995e-03, 1.6008e-04, 3.7637e-04, 1.3155e-04, 4.5707e-05],\n", - " [ 1.6008e-04, 8.3649e-04, 4.2130e-05, 9.5201e-05, 1.6981e-04],\n", - " [ 3.7637e-04, 4.2130e-05, 1.1736e-03, 3.9943e-04, -2.7599e-04],\n", - " [ 1.3155e-04, 9.5201e-05, 3.9942e-04, 4.7651e-04, 1.6600e-04],\n", - " [ 4.5707e-05, 1.6981e-04, -2.7599e-04, 1.6600e-04, 1.3608e-03]],\n", + "tensor([[ 6.5489e-04, 1.8794e-04, 2.2678e-04, -2.8653e-04, 1.9096e-04],\n", + " [ 1.8794e-04, 7.0443e-04, 2.0275e-04, -2.2673e-04, 2.6704e-04],\n", + " [ 2.2678e-04, 2.0275e-04, 6.8227e-04, -1.5024e-05, 3.2128e-04],\n", + " [-2.8653e-04, -2.2673e-04, -1.5024e-05, 1.1865e-03, 1.9364e-04],\n", + " [ 1.9096e-04, 2.6704e-04, 3.2128e-04, 1.9364e-04, 1.0109e-03]],\n", " device='cuda:0')\n" ] } @@ -409,11 +427,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "tensor([[ 1.2995e-03, 1.6008e-04, 3.7637e-04, 1.3155e-04, 4.5707e-05],\n", - " [ 1.6008e-04, 8.3649e-04, 4.2130e-05, 9.5201e-05, 1.6981e-04],\n", - " [ 3.7637e-04, 4.2130e-05, 1.1736e-03, 3.9943e-04, -2.7599e-04],\n", - " [ 1.3155e-04, 9.5201e-05, 3.9942e-04, 4.7651e-04, 1.6600e-04],\n", - " [ 4.5707e-05, 1.6981e-04, -2.7599e-04, 1.6600e-04, 1.3608e-03]],\n", + "tensor([[ 6.5489e-04, 1.8794e-04, 2.2678e-04, -2.8653e-04, 1.9096e-04],\n", + " [ 1.8794e-04, 7.0443e-04, 2.0275e-04, -2.2673e-04, 2.6704e-04],\n", + " [ 2.2678e-04, 2.0275e-04, 6.8227e-04, -1.5024e-05, 3.2128e-04],\n", + " [-2.8653e-04, -2.2673e-04, -1.5024e-05, 1.1865e-03, 1.9364e-04],\n", + " [ 1.9096e-04, 2.6704e-04, 3.2128e-04, 1.9364e-04, 1.0109e-03]],\n", " dtype=torch.float64)\n" ] } @@ -449,7 +467,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.6" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/src/Dockerfile.gpulibs b/src/Dockerfile.gpulibs index 3c4b18c..a352c4f 100644 --- a/src/Dockerfile.gpulibs +++ b/src/Dockerfile.gpulibs @@ -3,6 +3,13 @@ LABEL maintainer="Christoph Schranz , Mat # Install Tensorflow, check compatibility here: # https://www.tensorflow.org/install/source#gpu # installation via conda leads to errors in version 4.8.2 +# Install CUDA-specific nvidia libraries and update libcudnn8 before that +USER root +RUN apt-get update && \ + apt-get install -y --no-install-recommends --allow-change-held-packages libcudnn8 && \ + apt-get install -y --no-install-recommends libnvinfer-dev libnvinfer-plugin-dev && \ + apt-get clean && rm -rf /var/lib/apt/lists/* +RUN cd /usr/lib/x86_64-linux-gnu && ln -s libnvinfer_plugin.so.8 libnvinfer_plugin.so.7 && ln -s libnvinfer.so.8 libnvinfer.so.7 USER ${NB_UID} RUN pip install --upgrade pip && \ pip install --no-cache-dir tensorflow==2.15.0 keras==2.15.0 && \