From d3e23051826d9cbdd6153032511900861a3788b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B6ren=20Brunk?= Date: Fri, 23 Jun 2023 19:19:33 +0200 Subject: [PATCH 1/3] Use cuda 12 and add variable for cuda version --- build.sbt | 6 ++++-- docs/installation.md | 10 +++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/build.sbt b/build.sbt index fba97ec6..65397499 100644 --- a/build.sbt +++ b/build.sbt @@ -25,6 +25,7 @@ ThisBuild / apiURL := Some(new URL("https://storch.dev/api/")) val scrImageVersion = "4.0.34" val pytorchVersion = "2.0.1" +val cudaVersion = "12.1-8.9" val openblasVersion = "0.3.23" val mklVersion = "2023.1" ThisBuild / scalaVersion := "3.3.0" @@ -74,7 +75,7 @@ lazy val core = project (if (enableGPU.value) "pytorch-gpu" else "pytorch") -> pytorchVersion, "mkl" -> mklVersion, "openblas" -> openblasVersion - ) ++ (if (enableGPU.value) Seq("cuda-redist" -> "11.8-8.6") else Seq()), + ) ++ (if (enableGPU.value) Seq("cuda-redist" -> "12.1-8.9") else Seq()), javaCppPlatform := org.bytedeco.sbt.javacpp.Platform.current, fork := true, Test / fork := true, @@ -125,7 +126,8 @@ lazy val docs = project "JAVACPP_VERSION" -> javaCppVersion.value, "PYTORCH_VERSION" -> pytorchVersion, "OPENBLAS_VERSION" -> openblasVersion, - "MKL_VERSION" -> mklVersion + "MKL_VERSION" -> mklVersion, + "CUDA_VERSION" -> cudaVersion ), ScalaUnidoc / unidoc / unidocProjectFilter := inAnyProject -- inProjects(examples), Laika / sourceDirectories ++= Seq(sourceDirectory.value), diff --git a/docs/installation.md b/docs/installation.md index 6c0b4539..2929f31a 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -153,7 +153,7 @@ resolvers ++= Resolver.sonatypeOssRepos("snapshots") libraryDependencies += Seq( "dev.storch" %% "core" % "@VERSION@", "org.bytedeco" % "pytorch-platform-gpu" % "@PYTORCH_VERSION@-@JAVACPP_VERSION@", - "org.bytedeco" % "cuda-platform-redist" % "11.8-8.6-@JAVACPP_VERSION@" + "org.bytedeco" % "cuda-platform-redist" % 
"@CUDA_VERSION@-@JAVACPP_VERSION@" ) fork := true ``` @@ -165,7 +165,7 @@ fork := true //> using repository "sonatype:snapshots" //> using lib "dev.storch::core:@VERSION@" //> using lib "org.bytedeco:pytorch-platform-gpu:@PYTORCH_VERSION@-@JAVACPP_VERSION@" -//> using lib "org.bytedeco:cuda-platform-redist:11.8-8.6-@JAVACPP_VERSION@" +//> using lib "org.bytedeco:cuda-platform-redist:@CUDA_VERSION@-@JAVACPP_VERSION@" ``` @:@ @@ -189,7 +189,7 @@ libraryDependencies += Seq( "org.bytedeco" % "pytorch" % "@PYTORCH_VERSION@-@JAVACPP_VERSION@", "org.bytedeco" % "pytorch" % "@PYTORCH_VERSION@-@JAVACPP_VERSION@" classifier "linux-x86_64-gpu", "org.bytedeco" % "openblas" % "@OPENBLAS_VERSION@-@JAVACPP_VERSION@" classifier "linux-x86_64", - "org.bytedeco" % "cuda" % "11.8-8.6-@JAVACPP_VERSION@" classifier "linux-x86_64-redist" + "org.bytedeco" % "cuda" % "@CUDA_VERSION@-@JAVACPP_VERSION@" classifier "linux-x86_64-redist" ) fork := true ``` @@ -202,7 +202,7 @@ fork := true //> using lib "dev.storch::core:@VERSION@" //> using lib "org.bytedeco:pytorch:@PYTORCH_VERSION@-@JAVACPP_VERSION@,classifier=linux-x86_64-gpu" //> using lib "org.bytedeco:openblas:@OPENBLAS_VERSION@-@JAVACPP_VERSION@,classifier=linux-x86_64" -//> using lib "org.bytedeco:cuda:11.8-8.6-@JAVACPP_VERSION@,classifier=linux-x86_64-redist" +//> using lib "org.bytedeco:cuda:@CUDA_VERSION@-@JAVACPP_VERSION@,classifier=linux-x86_64-redist" ``` @:@ @@ -223,7 +223,7 @@ resolvers ++= Resolver.sonatypeOssRepos("snapshots") libraryDependencies += Seq( "dev.storch" %% "core" % "@VERSION@", ) -javaCppPresetLibs ++= Seq("pytorch-gpu" -> "@PYTORCH_VERSION@", "openblas" -> "@OPENBLAS_VERSION@", "cuda-redist" -> "11.8-8.6") +javaCppPresetLibs ++= Seq("pytorch-gpu" -> "@PYTORCH_VERSION@", "openblas" -> "@OPENBLAS_VERSION@", "cuda-redist" -> "@CUDA_VERSION@") fork := true ``` From 1278777cd91d96caf99a62588949313168bec9f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B6ren=20Brunk?= Date: Sun, 25 Jun 2023 13:35:24 +0200 Subject: 
[PATCH 2/3] Add workaround for missing libcusolver after cuda 12 upgrade See https://github.com/bytedeco/javacpp-presets/issues/1376 --- build.sbt | 3 +- core/src/main/scala/torch/Tensor.scala | 3 ++ .../scala/torch/internal/LoadCusolver.scala | 28 +++++++++++++++++++ .../torch/internal/NativeConverters.scala | 2 ++ docs/faq.md | 28 +++++++++++++++++++ 5 files changed, 63 insertions(+), 1 deletion(-) create mode 100644 core/src/main/scala/torch/internal/LoadCusolver.scala create mode 100644 docs/faq.md diff --git a/build.sbt b/build.sbt index 65397499..30a36870 100644 --- a/build.sbt +++ b/build.sbt @@ -75,7 +75,8 @@ lazy val core = project (if (enableGPU.value) "pytorch-gpu" else "pytorch") -> pytorchVersion, "mkl" -> mklVersion, "openblas" -> openblasVersion - ) ++ (if (enableGPU.value) Seq("cuda-redist" -> "12.1-8.9") else Seq()), + // TODO remove cuda (not cuda-redist) once https://github.com/bytedeco/javacpp-presets/issues/1376 is fixed + ) ++ (if (enableGPU.value) Seq("cuda-redist" -> cudaVersion, "cuda" -> cudaVersion) else Seq()), javaCppPlatform := org.bytedeco.sbt.javacpp.Platform.current, fork := true, Test / fork := true, diff --git a/core/src/main/scala/torch/Tensor.scala b/core/src/main/scala/torch/Tensor.scala index bd765580..9cb4b010 100644 --- a/core/src/main/scala/torch/Tensor.scala +++ b/core/src/main/scala/torch/Tensor.scala @@ -64,6 +64,7 @@ import org.bytedeco.pytorch.DoubleArrayRef import org.bytedeco.pytorch.EllipsisIndexType import org.bytedeco.pytorch.SymInt import org.bytedeco.pytorch.SymIntOptional +import internal.LoadCusolver case class TensorTuple[D <: DType]( values: Tensor[D], @@ -729,6 +730,8 @@ type IntTensor = UInt8Tensor | Int8Tensor | Int16Tensor | Int32Tensor | Int64Ten type ComplexTensor = Complex32Tensor | Complex64Tensor | Complex128Tensor object Tensor: + LoadCusolver // TODO workaround for https://github.com/bytedeco/javacpp-presets/issues/1376 + def apply[D <: DType](native: pytorch.Tensor): Tensor[D] = 
(native.scalar_type().intern() match case ScalarType.Byte => new UInt8Tensor(native) case ScalarType.Char => new Int8Tensor(native) diff --git a/core/src/main/scala/torch/internal/LoadCusolver.scala b/core/src/main/scala/torch/internal/LoadCusolver.scala new file mode 100644 index 00000000..f1d125de --- /dev/null +++ b/core/src/main/scala/torch/internal/LoadCusolver.scala @@ -0,0 +1,28 @@ +/* + * Copyright 2022 storch.dev + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package torch +package internal + +import org.bytedeco.javacpp.Loader +import org.bytedeco.cuda.global.cusolver +import org.bytedeco.cuda.global.cudart + +// This is a workaround for https://github.com/bytedeco/javacpp-presets/issues/1376 +// TODO remove once the issue is fixed +object LoadCusolver { + Loader.load(classOf[cusolver]) +} diff --git a/core/src/main/scala/torch/internal/NativeConverters.scala b/core/src/main/scala/torch/internal/NativeConverters.scala index 7d82a58d..28967fb2 100644 --- a/core/src/main/scala/torch/internal/NativeConverters.scala +++ b/core/src/main/scala/torch/internal/NativeConverters.scala @@ -38,6 +38,8 @@ import scala.annotation.targetName private[torch] object NativeConverters: + LoadCusolver // TODO workaround for https://github.com/bytedeco/javacpp-presets/issues/1376 + inline def convertToOptional[T, U <: T | Option[T], V >: Null](i: U, f: T => V): V = i match case i: Option[T] => i.map(f(_)).orNull case i: T => f(i) diff --git 
a/docs/faq.md b/docs/faq.md new file mode 100644 index 00000000..61fffb5a --- /dev/null +++ b/docs/faq.md @@ -0,0 +1,28 @@ +# Frequently Asked Questions + +## Q: I want to run operations on the GPU, but Storch seems to hang? + +Depending on your hardware, the CUDA version and capability settings, CUDA might need to do [just-in-time compilation](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#just-in-time-compilation) of your kernels, which +can take a few minutes. The result is cached, so it should load faster on subsequent runs. + +If you're unsure, you can watch the size of the cache: + +```bash +watch -d du -sm ~/.nv/ComputeCache +``` +If it's still growing, it's very likely that CUDA is doing just-in-time compilation. + +You can also increase the cache size to up to 4GB, to avoid recomputation: + +```bash +export CUDA_CACHE_MAXSIZE=4294967296 +``` + + +## Q: What about GPU support on my Mac? + +Recent PyTorch versions provide a new backend based on Apple's Metal Performance Shaders (MPS). +The MPS backend enables GPU-accelerated training on the M1/M2 architecture. +Right now, there's no ARM build of PyTorch in JavaCPP and MPS is not enabled. +If you have an M1/M2 machine and want to help, check the umbrella [issue for macosx-aarch64 support](https://github.com/bytedeco/javacpp-presets/issues/1069).
\ No newline at end of file From a573b3961850712f1bfc959b1e3ebae9718f8b80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B6ren=20Brunk?= Date: Sun, 25 Jun 2023 15:09:51 +0200 Subject: [PATCH 3/3] Only load native cusolver if it's available on the classpath --- core/src/main/scala/torch/internal/LoadCusolver.scala | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/torch/internal/LoadCusolver.scala b/core/src/main/scala/torch/internal/LoadCusolver.scala index f1d125de..cf832c9b 100644 --- a/core/src/main/scala/torch/internal/LoadCusolver.scala +++ b/core/src/main/scala/torch/internal/LoadCusolver.scala @@ -18,11 +18,15 @@ package torch package internal import org.bytedeco.javacpp.Loader -import org.bytedeco.cuda.global.cusolver -import org.bytedeco.cuda.global.cudart // This is a workaround for https://github.com/bytedeco/javacpp-presets/issues/1376 // TODO remove once the issue is fixed object LoadCusolver { - Loader.load(classOf[cusolver]) + try { + val cusolver = Class.forName("org.bytedeco.cuda.global.cusolver") + Loader.load(cusolver) + } catch { + case e: ClassNotFoundException => // ignore to avoid breaking CPU only builds + } + }