Skip to content

Commit

Permalink
Add UCXX support (rapidsai#1983)
Browse files Browse the repository at this point in the history
Add support for [UCXX](https://github.com/rapidsai/ucxx). It is our intention to soon switch from UCX-Py to UCXX and archive the former.

This PR adds support for UCXX on the C++ backend but retains the original UCX implementation (based on the UCP layer) for now. Moving to UCXX will simplify the RAFT code a bit, given that the UCXX implementation requires fewer lines of boilerplate code.

On the Python front, raft-dask tests are added for both UCX-Py (for which there were previously none) and UCXX. The UCX-Py tests continue to use the UCX (UCP layer) implementation, whereas the UCXX tests use the UCXX C++ implementation.

When the switch is complete, we can remove all previous UCX/UCX-Py code from the RAFT codebase. If, for some reason, the UCX (UCP layer) implementation is preferred on the C++ backend over the UCXX C++ implementation, it can be retained there, but the UCX-Py code will be archived and dropped in favor of the UCXX Python backend.

Authors:
  - Peter Andreas Entschev (https://github.com/pentschev)
  - Akira Naruse (https://github.com/anaruse)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Jake Awe (https://github.com/AyodeAwe)
  - Divye Gala (https://github.com/divyegala)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: rapidsai#1983
  • Loading branch information
pentschev authored May 7, 2024
1 parent fd64c24 commit 19842a2
Show file tree
Hide file tree
Showing 29 changed files with 735 additions and 157 deletions.
10 changes: 8 additions & 2 deletions ci/build_wheel.sh
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
#!/bin/bash
# Copyright (c) 2023, NVIDIA CORPORATION.
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

set -euo pipefail

package_name=$1
package_dir=$2
underscore_package_name=$(echo "${package_name}" | tr "-" "_")

# Clear out system ucx files to ensure that we're getting ucx from the wheel.
rm -rf /usr/lib64/ucx
rm -rf /usr/lib64/libuc*

source rapids-configure-sccache
source rapids-date-string

Expand Down Expand Up @@ -38,9 +42,11 @@ fi

if [[ ${package_name} == "raft-dask" ]]; then
sed -r -i "s/pylibraft==(.*)\"/pylibraft${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
sed -r -i "s/libucx(.*)\"/libucx${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file}
sed -r -i "s/ucx-py==(.*)\"/ucx-py${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" ${pyproject_file}
sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file}
sed -r -i "s/distributed-ucxx==(.*)\"/distributed-ucxx${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
else
sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file}
fi
Expand All @@ -56,6 +62,6 @@ cd "${package_dir}"
python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check

mkdir -p final_dist
python -m auditwheel repair -w final_dist dist/*
python -m auditwheel repair -w final_dist --exclude "libucp.so.0" dist/*

RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist
9 changes: 8 additions & 1 deletion ci/release/update-version.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ function sed_runner() {
}

sed_runner "s/set(RAPIDS_VERSION .*)/set(RAPIDS_VERSION \"${NEXT_SHORT_TAG}\")/g" cpp/template/cmake/thirdparty/fetch_rapids.cmake
sed_runner 's/'"find_and_configure_ucxx(VERSION .*"'/'"find_and_configure_ucxx(VERSION ${NEXT_UCX_PY_SHORT_TAG_PEP440}"'/g' python/raft-dask/cmake/thirdparty/get_ucxx.cmake
sed_runner 's/'"branch-.*"'/'"branch-${NEXT_UCX_PY_SHORT_TAG_PEP440}"'/g' python/raft-dask/cmake/thirdparty/get_ucxx.cmake

# Centralized version file update
echo "${NEXT_FULL_TAG}" > VERSION
Expand All @@ -50,7 +52,7 @@ DEPENDENCIES=(
rmm-cu11
rmm-cu12
rapids-dask-dependency
# ucx-py is handled separately below
# ucx-py and ucxx are handled separately below
)
for FILE in dependencies.yaml conda/environments/*.yaml; do
for DEP in "${DEPENDENCIES[@]}"; do
Expand All @@ -59,6 +61,10 @@ for FILE in dependencies.yaml conda/environments/*.yaml; do
sed_runner "/-.* ucx-py==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE};
sed_runner "/-.* ucx-py-cu11==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE};
sed_runner "/-.* ucx-py-cu12==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE};
sed_runner "/-.* libucxx==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE};
sed_runner "/-.* distributed-ucxx==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE};
sed_runner "/-.* distributed-ucxx-cu11==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE};
sed_runner "/-.* distributed-ucxx-cu12==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE};
done
for FILE in python/*/pyproject.toml; do
for DEP in "${DEPENDENCIES[@]}"; do
Expand All @@ -68,6 +74,7 @@ for FILE in python/*/pyproject.toml; do
done

sed_runner "/^ucx_py_version:$/ {n;s/.*/ - \"${NEXT_UCX_PY_VERSION}\"/}" conda/recipes/raft-dask/conda_build_config.yaml
sed_runner "/^ucxx_version:$/ {n;s/.*/ - \"${NEXT_UCX_PY_VERSION}\"/}" conda/recipes/raft-dask/conda_build_config.yaml

for FILE in .github/workflows/*.yaml; do
sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
Expand Down
18 changes: 18 additions & 0 deletions ci/test_python.sh
Original file line number Diff line number Diff line change
Expand Up @@ -59,5 +59,23 @@ rapids-logger "pytest raft-dask"
--cov-report=xml:"${RAPIDS_COVERAGE_DIR}/raft-dask-coverage.xml" \
--cov-report=term

rapids-logger "pytest raft-dask (ucx-py only)"
./ci/run_raft_dask_pytests.sh \
--junitxml="${RAPIDS_TESTS_DIR}/junit-raft-dask-ucx.xml" \
--cov-config=../.coveragerc \
--cov=raft_dask \
--cov-report=xml:"${RAPIDS_COVERAGE_DIR}/raft-dask-ucx-coverage.xml" \
--cov-report=term \
--run_ucx

rapids-logger "pytest raft-dask (ucxx only)"
./ci/run_raft_dask_pytests.sh \
--junitxml="${RAPIDS_TESTS_DIR}/junit-raft-dask-ucxx.xml" \
--cov-config=../.coveragerc \
--cov=raft_dask \
--cov-report=xml:"${RAPIDS_COVERAGE_DIR}/raft-dask-ucxx-coverage.xml" \
--cov-report=term \
--run_ucxx

rapids-logger "Test script exiting with value: $EXITCODE"
exit ${EXITCODE}
12 changes: 9 additions & 3 deletions ci/test_wheel_raft_dask.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,13 @@ RAPIDS_PY_WHEEL_NAME="raft_dask_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels
RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibraft-dep
python -m pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl

# echo to expand wildcard before adding `[extra]` requires for pip
python -m pip install $(echo ./dist/raft_dask*.whl)[test]
python -m pip install "raft_dask-${RAPIDS_PY_CUDA_SUFFIX}[test]>=0.0.0a0" --find-links dist/

python -m pytest ./python/raft-dask/raft_dask/test
# rapids-logger "pytest raft-dask"
# python -m pytest ./python/raft-dask/raft_dask/test

# rapids-logger "pytest raft-dask (ucx-py only)"
# python -m pytest ./python/raft-dask/raft_dask/test --run_ucx

rapids-logger "pytest raft-dask (ucxx only)"
python -m pytest ./python/raft-dask/raft_dask/test --run_ucxx
2 changes: 2 additions & 0 deletions conda/environments/all_cuda-118_arch-aarch64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ dependencies:
- cxx-compiler
- cython>=3.0.0
- dask-cuda==24.6.*
- distributed-ucxx==0.38.*
- doxygen>=1.8.20
- gcc_linux-aarch64=11.*
- graphviz
Expand All @@ -34,6 +35,7 @@ dependencies:
- libcusolver=11.4.1.48
- libcusparse-dev=11.7.5.86
- libcusparse=11.7.5.86
- libucxx==0.38.*
- nccl>=2.9.9
- ninja
- numba>=0.57
Expand Down
2 changes: 2 additions & 0 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ dependencies:
- cxx-compiler
- cython>=3.0.0
- dask-cuda==24.6.*
- distributed-ucxx==0.38.*
- doxygen>=1.8.20
- gcc_linux-64=11.*
- graphviz
Expand All @@ -34,6 +35,7 @@ dependencies:
- libcusolver=11.4.1.48
- libcusparse-dev=11.7.5.86
- libcusparse=11.7.5.86
- libucxx==0.38.*
- nccl>=2.9.9
- ninja
- numba>=0.57
Expand Down
2 changes: 2 additions & 0 deletions conda/environments/all_cuda-122_arch-aarch64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ dependencies:
- cxx-compiler
- cython>=3.0.0
- dask-cuda==24.6.*
- distributed-ucxx==0.38.*
- doxygen>=1.8.20
- gcc_linux-aarch64=11.*
- graphviz
Expand All @@ -31,6 +32,7 @@ dependencies:
- libcurand-dev
- libcusolver-dev
- libcusparse-dev
- libucxx==0.38.*
- nccl>=2.9.9
- ninja
- numba>=0.57
Expand Down
2 changes: 2 additions & 0 deletions conda/environments/all_cuda-122_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ dependencies:
- cxx-compiler
- cython>=3.0.0
- dask-cuda==24.6.*
- distributed-ucxx==0.38.*
- doxygen>=1.8.20
- gcc_linux-64=11.*
- graphviz
Expand All @@ -31,6 +32,7 @@ dependencies:
- libcurand-dev
- libcusolver-dev
- libcusparse-dev
- libucxx==0.38.*
- nccl>=2.9.9
- ninja
- numba>=0.57
Expand Down
1 change: 1 addition & 0 deletions conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ dependencies:
- libcusolver=11.4.1.48
- libcusparse-dev=11.7.5.86
- libcusparse=11.7.5.86
- libucxx==0.38.*
- matplotlib
- nccl>=2.9.9
- ninja
Expand Down
1 change: 1 addition & 0 deletions conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ dependencies:
- libcusolver=11.4.1.48
- libcusparse-dev=11.7.5.86
- libcusparse=11.7.5.86
- libucxx==0.38.*
- matplotlib
- nccl>=2.9.9
- ninja
Expand Down
1 change: 1 addition & 0 deletions conda/environments/bench_ann_cuda-120_arch-aarch64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ dependencies:
- libcurand-dev
- libcusolver-dev
- libcusparse-dev
- libucxx==0.38.*
- matplotlib
- nccl>=2.9.9
- ninja
Expand Down
1 change: 1 addition & 0 deletions conda/environments/bench_ann_cuda-120_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ dependencies:
- libcurand-dev
- libcusolver-dev
- libcusparse-dev
- libucxx==0.38.*
- matplotlib
- nccl>=2.9.9
- ninja
Expand Down
6 changes: 3 additions & 3 deletions conda/recipes/raft-dask/conda_build_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ c_stdlib:
c_stdlib_version:
- "2.17"

ucx_version:
- ">=1.15.0,<1.16.0"

ucx_py_version:
- "0.38.*"

ucxx_version:
- "0.38.*"

cmake_version:
- ">=3.26.4"
7 changes: 3 additions & 4 deletions conda/recipes/raft-dask/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,9 @@ requirements:
- rmm ={{ minor_version }}
- scikit-build-core >=0.7.0
- setuptools
- ucx {{ ucx_version }}
- ucx-proc=*=gpu
- ucx-py {{ ucx_py_version }}
- libucxx {{ ucxx_version }}
- ucxx {{ ucxx_version }}
run:
{% if cuda_major == "11" %}
- cudatoolkit
Expand All @@ -73,9 +73,8 @@ requirements:
- pylibraft {{ version }}
- python x.x
- rmm ={{ minor_version }}
- ucx {{ ucx_version }}
- ucx-proc=*=gpu
- ucx-py {{ ucx_py_version }}
- ucxx {{ ucxx_version }}

tests:
requirements:
Expand Down
15 changes: 12 additions & 3 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -650,12 +650,21 @@ rapids_find_generate_module(
INSTALL_EXPORT_SET raft-distributed-exports
)

rapids_export_package(BUILD ucx raft-distributed-exports)
rapids_export_package(INSTALL ucx raft-distributed-exports)
rapids_export_package(
BUILD ucxx raft-distributed-exports COMPONENTS ucxx python GLOBAL_TARGETS ucxx::ucxx ucxx::python
)
rapids_export_package(
INSTALL ucxx raft-distributed-exports COMPONENTS ucxx python GLOBAL_TARGETS ucxx::ucxx
ucxx::python
)
rapids_export_package(BUILD NCCL raft-distributed-exports)
rapids_export_package(INSTALL NCCL raft-distributed-exports)

target_link_libraries(raft_distributed INTERFACE ucx::ucp NCCL::NCCL)
# ucx is a requirement for raft_distributed, but its config is not safe to be found multiple times,
# so rather than exporting a package dependency on it above we rely on consumers to find it
# themselves. Once https://github.com/rapidsai/ucxx/issues/173 is resolved we can export it above
# again.
target_link_libraries(raft_distributed INTERFACE ucx::ucp ucxx::ucxx NCCL::NCCL)

# ##################################################################################################
# * install targets-----------------------------------------------------------
Expand Down
Loading

0 comments on commit 19842a2

Please sign in to comment.