From d3e23051826d9cbdd6153032511900861a3788b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B6ren=20Brunk?= Date: Fri, 23 Jun 2023 19:19:33 +0200 Subject: [PATCH 1/3] Use cuda 12 and add variable for cuda version --- build.sbt | 6 ++++-- docs/installation.md | 10 +++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/build.sbt b/build.sbt index fba97ec6..65397499 100644 --- a/build.sbt +++ b/build.sbt @@ -25,6 +25,7 @@ ThisBuild / apiURL := Some(new URL("https://storch.dev/api/")) val scrImageVersion = "4.0.34" val pytorchVersion = "2.0.1" +val cudaVersion = "12.1-8.9" val openblasVersion = "0.3.23" val mklVersion = "2023.1" ThisBuild / scalaVersion := "3.3.0" @@ -74,7 +75,7 @@ lazy val core = project (if (enableGPU.value) "pytorch-gpu" else "pytorch") -> pytorchVersion, "mkl" -> mklVersion, "openblas" -> openblasVersion - ) ++ (if (enableGPU.value) Seq("cuda-redist" -> "11.8-8.6") else Seq()), + ) ++ (if (enableGPU.value) Seq("cuda-redist" -> "12.1-8.9") else Seq()), javaCppPlatform := org.bytedeco.sbt.javacpp.Platform.current, fork := true, Test / fork := true, @@ -125,7 +126,8 @@ lazy val docs = project "JAVACPP_VERSION" -> javaCppVersion.value, "PYTORCH_VERSION" -> pytorchVersion, "OPENBLAS_VERSION" -> openblasVersion, - "MKL_VERSION" -> mklVersion + "MKL_VERSION" -> mklVersion, + "CUDA_VERSION" -> cudaVersion ), ScalaUnidoc / unidoc / unidocProjectFilter := inAnyProject -- inProjects(examples), Laika / sourceDirectories ++= Seq(sourceDirectory.value), diff --git a/docs/installation.md b/docs/installation.md index 6c0b4539..2929f31a 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -153,7 +153,7 @@ resolvers ++= Resolver.sonatypeOssRepos("snapshots") libraryDependencies += Seq( "dev.storch" %% "core" % "@VERSION@", "org.bytedeco" % "pytorch-platform-gpu" % "@PYTORCH_VERSION@-@JAVACPP_VERSION@", - "org.bytedeco" % "cuda-platform-redist" % "11.8-8.6-@JAVACPP_VERSION@" + "org.bytedeco" % "cuda-platform-redist" % 
"@CUDA_VERSION@-@JAVACPP_VERSION@" ) fork := true ``` @@ -165,7 +165,7 @@ fork := true //> using repository "sonatype:snapshots" //> using lib "dev.storch::core:@VERSION@" //> using lib "org.bytedeco:pytorch-platform-gpu:@PYTORCH_VERSION@-@JAVACPP_VERSION@" -//> using lib "org.bytedeco:cuda-platform-redist:11.8-8.6-@JAVACPP_VERSION@" +//> using lib "org.bytedeco:cuda-platform-redist:@CUDA_VERSION@-@JAVACPP_VERSION@" ``` @:@ @@ -189,7 +189,7 @@ libraryDependencies += Seq( "org.bytedeco" % "pytorch" % "@PYTORCH_VERSION@-@JAVACPP_VERSION@", "org.bytedeco" % "pytorch" % "@PYTORCH_VERSION@-@JAVACPP_VERSION@" classifier "linux-x86_64-gpu", "org.bytedeco" % "openblas" % "@OPENBLAS_VERSION@-@JAVACPP_VERSION@" classifier "linux-x86_64", - "org.bytedeco" % "cuda" % "11.8-8.6-@JAVACPP_VERSION@" classifier "linux-x86_64-redist" + "org.bytedeco" % "cuda" % "@CUDA_VERSION@-@JAVACPP_VERSION@" classifier "linux-x86_64-redist" ) fork := true ``` @@ -202,7 +202,7 @@ fork := true //> using lib "dev.storch::core:@VERSION@" //> using lib "org.bytedeco:pytorch:@PYTORCH_VERSION@-@JAVACPP_VERSION@,classifier=linux-x86_64-gpu" //> using lib "org.bytedeco:openblas:@OPENBLAS_VERSION@-@JAVACPP_VERSION@,classifier=linux-x86_64" -//> using lib "org.bytedeco:cuda:11.8-8.6-@JAVACPP_VERSION@,classifier=linux-x86_64-redist" +//> using lib "org.bytedeco:cuda:@CUDA_VERSION@-@JAVACPP_VERSION@,classifier=linux-x86_64-redist" ``` @:@ @@ -223,7 +223,7 @@ resolvers ++= Resolver.sonatypeOssRepos("snapshots") libraryDependencies += Seq( "dev.storch" %% "core" % "@VERSION@", ) -javaCppPresetLibs ++= Seq("pytorch-gpu" -> "@PYTORCH_VERSION@", "openblas" -> "@OPENBLAS_VERSION@", "cuda-redist" -> "11.8-8.6") +javaCppPresetLibs ++= Seq("pytorch-gpu" -> "@PYTORCH_VERSION@", "openblas" -> "@OPENBLAS_VERSION@", "cuda-redist" -> "@CUDA_VERSION@") fork := true ``` From 1278777cd91d96caf99a62588949313168bec9f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B6ren=20Brunk?= Date: Sun, 25 Jun 2023 13:35:24 +0200 Subject: 
[PATCH 2/3] Add workaround for missing libcusolver after cuda 12 upgrade See https://github.com/bytedeco/javacpp-presets/issues/1376 --- build.sbt | 3 +- core/src/main/scala/torch/Tensor.scala | 3 ++ .../scala/torch/internal/LoadCusolver.scala | 28 +++++++++++++++++++ .../torch/internal/NativeConverters.scala | 2 ++ docs/faq.md | 28 +++++++++++++++++++ 5 files changed, 63 insertions(+), 1 deletion(-) create mode 100644 core/src/main/scala/torch/internal/LoadCusolver.scala create mode 100644 docs/faq.md diff --git a/build.sbt b/build.sbt index 65397499..30a36870 100644 --- a/build.sbt +++ b/build.sbt @@ -75,7 +75,8 @@ lazy val core = project (if (enableGPU.value) "pytorch-gpu" else "pytorch") -> pytorchVersion, "mkl" -> mklVersion, "openblas" -> openblasVersion - ) ++ (if (enableGPU.value) Seq("cuda-redist" -> "12.1-8.9") else Seq()), + // TODO remove cuda (not cuda-redist) once https://github.com/bytedeco/javacpp-presets/issues/1376 is fixed + ) ++ (if (enableGPU.value) Seq("cuda-redist" -> cudaVersion, "cuda" -> cudaVersion) else Seq()), javaCppPlatform := org.bytedeco.sbt.javacpp.Platform.current, fork := true, Test / fork := true, diff --git a/core/src/main/scala/torch/Tensor.scala b/core/src/main/scala/torch/Tensor.scala index bd765580..9cb4b010 100644 --- a/core/src/main/scala/torch/Tensor.scala +++ b/core/src/main/scala/torch/Tensor.scala @@ -64,6 +64,7 @@ import org.bytedeco.pytorch.DoubleArrayRef import org.bytedeco.pytorch.EllipsisIndexType import org.bytedeco.pytorch.SymInt import org.bytedeco.pytorch.SymIntOptional +import internal.LoadCusolver case class TensorTuple[D <: DType]( values: Tensor[D], @@ -729,6 +730,8 @@ type IntTensor = UInt8Tensor | Int8Tensor | Int16Tensor | Int32Tensor | Int64Ten type ComplexTensor = Complex32Tensor | Complex64Tensor | Complex128Tensor object Tensor: + LoadCusolver // TODO workaround for https://github.com/bytedeco/javacpp-presets/issues/1376 + def apply[D <: DType](native: pytorch.Tensor): Tensor[D] = 
(native.scalar_type().intern() match case ScalarType.Byte => new UInt8Tensor(native) case ScalarType.Char => new Int8Tensor(native) diff --git a/core/src/main/scala/torch/internal/LoadCusolver.scala b/core/src/main/scala/torch/internal/LoadCusolver.scala new file mode 100644 index 00000000..f1d125de --- /dev/null +++ b/core/src/main/scala/torch/internal/LoadCusolver.scala @@ -0,0 +1,28 @@ +/* + * Copyright 2022 storch.dev + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package torch +package internal + +import org.bytedeco.javacpp.Loader +import org.bytedeco.cuda.global.cusolver +import org.bytedeco.cuda.global.cudart + +// This is a workaround for https://github.com/bytedeco/javacpp-presets/issues/1376 +// TODO remove once the issue is fixed +object LoadCusolver { + Loader.load(classOf[cusolver]) +} diff --git a/core/src/main/scala/torch/internal/NativeConverters.scala b/core/src/main/scala/torch/internal/NativeConverters.scala index 7d82a58d..28967fb2 100644 --- a/core/src/main/scala/torch/internal/NativeConverters.scala +++ b/core/src/main/scala/torch/internal/NativeConverters.scala @@ -38,6 +38,8 @@ import scala.annotation.targetName private[torch] object NativeConverters: + LoadCusolver // TODO workaround for https://github.com/bytedeco/javacpp-presets/issues/1376 + inline def convertToOptional[T, U <: T | Option[T], V >: Null](i: U, f: T => V): V = i match case i: Option[T] => i.map(f(_)).orNull case i: T => f(i) diff --git 
a/docs/faq.md b/docs/faq.md new file mode 100644 index 00000000..61fffb5a --- /dev/null +++ b/docs/faq.md @@ -0,0 +1,28 @@ +# Frequently Asked Questions + +## Q: I want to run operations on the GPU, but Storch seems to hang? + +Depending on your hardware, the CUDA version and capability settings, CUDA might need to do [just-in-time compilation](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#just-in-time-compilation) of your kernels, which +can take a few minutes. The result is cached, so it should load faster on subsequent runs. + +If you're unsure, you can watch the size of the cache: + +```bash +watch -d du -sm ~/.nv/ComputeCache +``` +If it's still growing, it's very likely that CUDA is doing just-in-time compilation. + +You can also increase the cache size to up to 4GB, to avoid recomputation: + +```bash +export CUDA_CACHE_MAXSIZE=4294967296 +``` + + +## Q: What about GPU support on my Mac? + +Recent PyTorch versions provide a new backend based on Apple's Metal Performance Shaders (MPS). +The MPS backend enables GPU-accelerated training on the M1/M2 architecture. +Right now, there's no ARM build of PyTorch in JavaCPP and MPS is not enabled. +If you have an M1/M2 machine and want to help, check the umbrella [issue for macosx-aarch64 support](https://github.com/bytedeco/javacpp-presets/issues/1069).
\ No newline at end of file From a573b3961850712f1bfc959b1e3ebae9718f8b80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B6ren=20Brunk?= Date: Sun, 25 Jun 2023 15:09:51 +0200 Subject: [PATCH 3/3] Only load native cusolver if it's available on the classpath --- core/src/main/scala/torch/internal/LoadCusolver.scala | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/torch/internal/LoadCusolver.scala b/core/src/main/scala/torch/internal/LoadCusolver.scala index f1d125de..cf832c9b 100644 --- a/core/src/main/scala/torch/internal/LoadCusolver.scala +++ b/core/src/main/scala/torch/internal/LoadCusolver.scala @@ -18,11 +18,15 @@ package torch package internal import org.bytedeco.javacpp.Loader -import org.bytedeco.cuda.global.cusolver -import org.bytedeco.cuda.global.cudart // This is a workaround for https://github.com/bytedeco/javacpp-presets/issues/1376 // TODO remove once the issue is fixed object LoadCusolver { - Loader.load(classOf[cusolver]) + try { + val cusolver = Class.forName("org.bytedeco.cuda.global.cusolver") + Loader.load(cusolver) + } catch { + case e: ClassNotFoundException => // ignore to avoid breaking CPU only builds + } + }