Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature[cartesian]: Add support for HIP/ROCm #1278

Merged
merged 9 commits into from
Aug 3, 2023
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/gt4py/cartesian/backend/dace_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from dace.serialize import dumps

from gt4py import storage as gt_storage
from gt4py.cartesian import config as gt_config
from gt4py.cartesian.backend.base import CLIBackendMixin, register
from gt4py.cartesian.backend.gtc_common import (
BackendCodegen,
Expand Down Expand Up @@ -543,6 +544,8 @@ def keep_line(line):
def apply(cls, stencil_ir: gtir.Stencil, builder: "StencilBuilder", sdfg: dace.SDFG):
self = cls()
with dace.config.temporary_config():
if gt_config.GT4PY_USE_HIP:
dace.config.Config.set("compiler", "cuda", "backend", value="hip")
dace.config.Config.set("compiler", "cuda", "max_concurrent_streams", value=-1)
dace.config.Config.set("compiler", "cpu", "openmp_sections", value=False)
code_objects = sdfg.generate_code()
Expand Down
62 changes: 38 additions & 24 deletions src/gt4py/cartesian/backend/pyext_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,43 +85,48 @@ def get_gt_pyext_build_opts(
"-isystem{}".format(gt_config.build_settings["boost_include_path"]),
"-DBOOST_PP_VARIADICS",
*extra_compile_args_from_config["cxx"],
],
nvcc=[
"-std=c++17",
"-ftemplate-depth={}".format(gt_config.build_settings["cpp_template_depth"]),
]
)
extra_compile_args["cuda"] = [
"-std=c++17",
"-ftemplate-depth={}".format(gt_config.build_settings["cpp_template_depth"]),
"-isystem{}".format(gt_include_path),
"-isystem{}".format(gt_config.build_settings["boost_include_path"]),
havogt marked this conversation as resolved.
Show resolved Hide resolved
"-DBOOST_PP_VARIADICS",
"-DBOOST_OPTIONAL_CONFIG_USE_OLD_IMPLEMENTATION_OF_OPTIONAL",
"-DBOOST_OPTIONAL_USE_OLD_DEFINITION_OF_NONE",
*extra_compile_args_from_config["cuda"],
]
if gt_config.GT4PY_USE_HIP:
extra_compile_args["cuda"] += ["-fvisibility=hidden", "-fPIC"]
else:
extra_compile_args["cuda"] += [
"-arch=sm_{}".format(cuda_arch),
"-isystem={}".format(gt_include_path),
"-isystem={}".format(gt_config.build_settings["boost_include_path"]),
"-DBOOST_PP_VARIADICS",
"-DBOOST_OPTIONAL_CONFIG_USE_OLD_IMPLEMENTATION_OF_OPTIONAL",
"-DBOOST_OPTIONAL_USE_OLD_DEFINITION_OF_NONE",
"--expt-relaxed-constexpr",
"--compiler-options",
"-fvisibility=hidden",
"--compiler-options",
"-fPIC",
*extra_compile_args_from_config["nvcc"],
],
)
]
extra_link_args = gt_config.build_settings["extra_link_args"]

mode_flags = ["-O0", "-ggdb"] if debug_mode else ["-O3", "-DNDEBUG"]
extra_compile_args["cxx"].extend(mode_flags)
extra_compile_args["nvcc"].extend(mode_flags)
extra_compile_args["cuda"].extend(mode_flags)
extra_link_args.extend(mode_flags)

if dace_path := get_dace_module_path():
extra_compile_args["cxx"].append(
"-isystem{}".format(os.path.join(dace_path, "runtime/include"))
)
extra_compile_args["nvcc"].append(
"-isystem={}".format(os.path.join(dace_path, "runtime/include"))
extra_compile_args["cuda"].append(
"-isystem{}".format(os.path.join(dace_path, "runtime/include"))
havogt marked this conversation as resolved.
Show resolved Hide resolved
)

if add_profile_info:
profile_flags = ["-pg"]
extra_compile_args["cxx"].extend(profile_flags)
extra_compile_args["nvcc"].extend(profile_flags)
extra_compile_args["cuda"].extend(profile_flags)
extra_link_args.extend(profile_flags)

if uses_cuda:
Expand All @@ -142,8 +147,11 @@ def get_gt_pyext_build_opts(
if uses_cuda:
cuda_flags = []
for cpp_flag in cpp_flags:
cuda_flags.extend(["--compiler-options", cpp_flag])
build_opts["extra_compile_args"]["nvcc"].extend(cuda_flags)
if gt_config.GT4PY_USE_HIP:
cuda_flags.extend([cpp_flag])
else:
cuda_flags.extend(["--compiler-options", cpp_flag])
build_opts["extra_compile_args"]["cuda"].extend(cuda_flags)
elif cpp_flags:
build_opts["extra_compile_args"].extend(cpp_flags)

Expand Down Expand Up @@ -297,7 +305,10 @@ def build_pybind_cuda_ext(
library_dirs = library_dirs or []
library_dirs = [*library_dirs, gt_config.build_settings["cuda_library_path"]]
libraries = libraries or []
libraries = [*libraries, "cudart"]
if gt_config.GT4PY_USE_HIP:
libraries = [*libraries, "hiprtc"]
else:
libraries = [*libraries, "cudart"]
extra_compile_args = extra_compile_args or []

return build_pybind_ext(
Expand Down Expand Up @@ -342,16 +353,19 @@ def build_extensions(self) -> None:
# Save references to the original methods
original_compile = self.compiler._compile

def nvcc_compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
def cuda_compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
original_compiler_so = self.compiler.compiler_so
# Copy before we make any modifications.
cflags = copy.deepcopy(extra_postargs)
try:
if os.path.splitext(src)[-1] == ".cu":
nvcc_exec = os.path.join(gt_config.build_settings["cuda_bin_path"], "nvcc")
self.compiler.set_executable("compiler_so", [nvcc_exec])
if gt_config.GT4PY_USE_HIP:
cuda_exec = os.path.join(gt_config.build_settings["cuda_bin_path"], "hipcc")
else:
cuda_exec = os.path.join(gt_config.build_settings["cuda_bin_path"], "nvcc")
self.compiler.set_executable("compiler_so", [cuda_exec])
if isinstance(cflags, dict):
cflags = cflags["nvcc"]
cflags = cflags["cuda"]
elif isinstance(cflags, dict):
cflags = cflags["cxx"]

Expand All @@ -360,6 +374,6 @@ def nvcc_compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
# Put the original compiler back in place.
self.compiler.set_executable("compiler_so", original_compiler_so)

self.compiler._compile = nvcc_compile
self.compiler._compile = cuda_compile
build_ext.build_extensions(self)
self.compiler._compile = original_compile
22 changes: 18 additions & 4 deletions src/gt4py/cartesian/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,17 @@

CUDA_HOST_CXX: Optional[str] = os.environ.get("CUDA_HOST_CXX", None)

if "GT4PY_USE_HIP" in os.environ:
GT4PY_USE_HIP: bool = bool(int(os.environ["GT4PY_USE_HIP"]))
else:
# Autodetect cupy with ROCm/HIP support
try:
import cupy as _cp

GT4PY_USE_HIP = _cp.cuda.get_hipcc_path() is not None
del _cp
except Exception:
GT4PY_USE_HIP = False

GT_INCLUDE_PATH: str = os.path.abspath(gridtools_cpp.get_include_dir())

Expand All @@ -42,28 +53,31 @@
"boost_include_path": os.path.join(BOOST_ROOT, "include"),
"cuda_bin_path": os.path.join(CUDA_ROOT, "bin"),
"cuda_include_path": os.path.join(CUDA_ROOT, "include"),
"cuda_library_path": os.path.join(CUDA_ROOT, "lib64"),
"cuda_arch": os.environ.get("CUDA_ARCH", None),
"gt_include_path": os.environ.get("GT_INCLUDE_PATH", GT_INCLUDE_PATH),
"openmp_cppflags": os.environ.get("OPENMP_CPPFLAGS", "-fopenmp").split(),
"openmp_ldflags": os.environ.get("OPENMP_LDFLAGS", "-fopenmp").split(),
"extra_compile_args": {
"cxx": [],
"nvcc": [],
"cuda": [],
},
"extra_link_args": [],
"parallel_jobs": multiprocessing.cpu_count(),
"cpp_template_depth": os.environ.get("GT_CPP_TEMPLATE_DEPTH", GT_CPP_TEMPLATE_DEPTH),
}
if GT4PY_USE_HIP:
build_settings["cuda_library_path"] = os.path.join(CUDA_ROOT, "lib")
else:
build_settings["cuda_library_path"] = os.path.join(CUDA_ROOT, "lib64")

if CUDA_HOST_CXX is not None:
build_settings["extra_compile_args"]["nvcc"].append(f"-ccbin={CUDA_HOST_CXX}")
build_settings["extra_compile_args"]["cuda"].append(f"-ccbin={CUDA_HOST_CXX}")

cache_settings: Dict[str, Any] = {
"dir_name": os.environ.get("GT_CACHE_DIR_NAME", ".gt_cache"),
"root_path": os.environ.get("GT_CACHE_ROOT", os.path.abspath(".")),
"load_retries": int(os.environ.get("GT_CACHE_LOAD_RETRIES", 3)),
"load_retry_delay": int(os.environ.get("GT_CACHE_LOAD_RETRY_DELAY", 100)), # unit miliseconds
"load_retry_delay": int(os.environ.get("GT_CACHE_LOAD_RETRY_DELAY", 100)), # unit milliseconds
}

code_settings: Dict[str, Any] = {"root_package_name": "_GT_"}
Expand Down
14 changes: 14 additions & 0 deletions src/gt4py/storage/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import numpy as np
import numpy.typing as npt

from gt4py.cartesian import config as gt_config


if np.lib.NumpyVersion(np.__version__) >= "1.20.0":
from numpy.typing import DTypeLike
Expand Down Expand Up @@ -243,6 +245,18 @@ def allocate_gpu(
if device_field.ndim > 0:
device_field = device_field[tuple(slice(0, s, None) for s in shape)]

if gt_config.GT4PY_USE_HIP:
# HIP/ROCm lack support for __cuda_array_interface__
device_field.__hip_array_interface__ = {
Copy link
Contributor

@egparedes egparedes Jun 29, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this be called __cuda_array_interface__ to avoid any changes in the gridtools bindings?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree, unless things have changed, it seems that the community just always uses the cuda_array_interface name even for AMD device buffers. (e.g. most importantly cupy)

Copy link
Contributor Author

@stubbiali stubbiali Jun 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I fully agree that __cuda_array_interface__ should be the way to go, but that's not feasible at the moment. CuPy does not allow one to either read or overload __cuda_array_interface__. See this issue and the links therein: cupy/cupy#7431. That said, my solution is clearly a hack. DLPack seems to be the solution the community is converging on, but I must say I'm not very familiar with it, so I need to dig into it first.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My understanding of cupy/cupy#7431 is that it covers a particular case where you want to wrap C++-side-allocated data in a CuPy buffer. Have you tried it and run into the error? Last time we ran on AMD, a while back, we used __cuda_array_interface__ happily to pass CuPy arrays to stencils.

Copy link
Contributor Author

@stubbiali stubbiali Jun 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I tried and the GridTools bindings threw the exception AttributeError: HIP/ROCm does not support cuda array interface. Same happens if you try to read __cuda_array_interface__ from within Python. But I cannot exclude I'm missing something important.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 for DLPack.
I've seen accelerated adoption lately. Is it on the radar, @egparedes, to add it?
It feels not urgent, but it's something we could contribute later down the road.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this error is there, is it safe to just work around it in this way? There must be a reason why it is not supported.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, I'm not an expert here, but it's very likely that this workaround only works in simple scenarios such as ours (e.g., using a single stream).

Copy link
Contributor

@egparedes egparedes Jul 4, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 for DLPack. I've seen accelerated adoption lately. Is it on the radar, @egparedes, to add it? It feels not urgent, but it's something we could contribute later down the road.

@FlorianDeconinck Yes, we're looking into supporting DLPack. There is no timeline yet, but hopefully not far in the future.

"shape": padded_shape,
"typestr": device_field.dtype.descr[0][1],
"descr": device_field.dtype.descr,
"stream": 1,
"version": 3,
"strides": strides,
"data": (device_field.data.ptr, False),
}

return device_raw_buffer, device_field


Expand Down