Skip to content

Commit

Permalink
[ROCm] Optimize ROCm CI pipeline 2 (microsoft#16691)
Browse files Browse the repository at this point in the history
- Set `KERNEL_EXPLORER_TEST_USE_CUPY=1` to use cupy instead of numpy in the
kernel explorer tests.

With `KERNEL_EXPLORER_TEST_USE_CUPY=0`, the CPU utilization is as shown below:

![image](https://github.com/microsoft/onnxruntime/assets/94887879/91724b78-0b4e-4cbd-ad88-83cad9976472)

With `KERNEL_EXPLORER_TEST_USE_CUPY=1`, the CPU utilization is as shown below:

![image](https://github.com/microsoft/onnxruntime/assets/94887879/58239911-667c-4d5f-bb78-deca60d0266f)


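- Use the `Bash@3` task instead of `CmdLine@2` for the ROCm environment check and clean-up steps.
- Update the shell scripts to take named options parsed with `getopts` instead of positional arguments.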
  • Loading branch information
PeixuanZuo committed Jul 24, 2023
1 parent 21ef144 commit 8ede2f1
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 42 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -133,12 +133,11 @@ jobs:
DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )"
Repository: onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-test

- task: CmdLine@2
- task: Bash@3
inputs:
script: |-
echo "Select agent: $(Agent.Name), GPU: $HIP_VISIBLE_DEVICES, render: $DRIVER_RENDER"
bash $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_clean_device.sh $(Agent.Name) $HIP_VISIBLE_DEVICES
workingDirectory: $(Build.SourcesDirectory)
targetType: filePath
filePath: $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_clean_device.sh
arguments: -n $(Agent.Name) -d $HIP_VISIBLE_DEVICES -r $DRIVER_RENDER
displayName: 'Check ROCm Environment'

- task: CmdLine@2
Expand Down Expand Up @@ -182,6 +181,7 @@ jobs:
set -ex; \
export KERNEL_EXPLORER_BUILD_DIR=/build/$(BuildConfig); \
export KERNEL_EXPLORER_BATCHED_GEMM_MAX_BATCH_SIZE=8; \
export KERNEL_EXPLORER_TEST_USE_CUPY=1; \
pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 8 --reruns 1 --durations=100"
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Run kernel explorer tests'
Expand All @@ -206,7 +206,7 @@ jobs:
set -ex; \
export PYTHONPATH=/build/$(BuildConfig); \
python -m onnxruntime.training.ortmodule.torch_cpp_extensions.install; \
bash /onnxruntime_src/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh $(RocmVersion)"
bash /onnxruntime_src/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh -v $(RocmVersion)"
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Run Python Hugging-Face BERT-L test'
condition: succeededOrFailed()
Expand Down Expand Up @@ -250,11 +250,12 @@ jobs:
displayName: 'Run orttraining_ortmodule_tests.py'
condition: succeededOrFailed()

- task: CmdLine@2

- task: Bash@3
inputs:
script: |-
bash $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_clean_device.sh $(Agent.Name) $HIP_VISIBLE_DEVICES
workingDirectory: $(Build.SourcesDirectory)
targetType: filePath
filePath: $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_clean_device.sh
arguments: -n $(Agent.Name) -d $HIP_VISIBLE_DEVICES -r $DRIVER_RENDER
displayName: 'Clean ROCm Environment'
condition: always()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ ARG MIGRAPHX_VERSION=rocm-5.5.0
ENV DEBIAN_FRONTEND noninteractive
ENV MIGRAPHX_DISABLE_FAST_GELU=1

RUN apt-get clean && apt-get update -y && apt-get upgrade -y && apt-get install -y locales unzip
RUN apt-get update -y && apt-get upgrade -y && apt-get autoremove -y && \
apt-get install -y locales unzip && apt-get clean -y
RUN locale-gen en_US.UTF-8
RUN update-locale LANG=en_US.UTF-8
ENV LC_ALL C.UTF-8
Expand Down
53 changes: 31 additions & 22 deletions tools/ci_build/github/pai/pai_clean_device.sh
Original file line number Diff line number Diff line change
@@ -1,38 +1,47 @@
#!/bin/bash
set -ex

agentName=$1
target_device=$2
echo "agent name $agentName"
echo "agent target device : $target_device"
# Print the accepted command-line options to stderr and exit with status 1.
usage() { echo "Usage: $0 [-n <agent name>] [-d <target device>] [-r <driver render>]" 1>&2; exit 1; }

# Parse the -n, -d and -r options (each requires a value) into AGENT_NAME,
# TARGET_DEVICE and DRIVER_RENDER. Anything getopts does not recognise falls
# through to the catch-all branch, which prints the usage text and exits.
while getopts "n:d:r:" parameter_Option
do case "${parameter_Option}"
in
n) AGENT_NAME=${OPTARG};;
d) TARGET_DEVICE=${OPTARG};;
r) DRIVER_RENDER=${OPTARG};;
*) usage ;;
esac
done

# Log the values in effect after option parsing. No check is made here that
# any of the three options was actually supplied.
echo "Agent Name: $AGENT_NAME, Target Device: $TARGET_DEVICE, Driver Render: $DRIVER_RENDER"

echo -e "\n ---- rocm-smi"
echo -e "\n ---- Execute rocm-smi"
rocm-smi

echo -e "\n ---- rocm-smi --showpids"
echo -e "\n ---- Execute rocm-smi --showpids"
rocm-smi --showpids

echo -e "\n ---- rocm-smi --showpidgpus"
echo -e "\n ---- Execute rocm-smi --showpidgpus"
rocm-smi --showpidgpus

echo -e "\n ---- rocm-smi --showpids detail"
echo -e "\n ---- Execute rocm-smi --showpids detail"
rocm-smi --showpids | awk '$1 ~/[0-9]+/{if((NR>6)) {print $1}}' | xargs -I {} ps {}

echo -e "\n ---- rocm-smi --showmeminfo"
echo -e "\n ---- Execute rocm-smi --showmeminfo"
rocm-smi --showmeminfo vram vis_vram gtt

echo -e "\n ---- Clean up the process that is using the target device"
gpu_details=$(rocm-smi --showpidgpus)
pid_lines=$(echo "$gpu_details" | grep -n "DRM device" | cut -d ":" -f 1)
pid_lines_array=($pid_lines)

for ((i = 0; i < ${#pid_lines_array[@]}; i++)); do
pid_line=${pid_lines_array[$i]}
pid=$(echo "$gpu_details" | awk '{print $2}' | sed -n "${pid_line}p")
gpu_line=$((pid_line + 1))
pid_gpu=$(echo "$gpu_details" | sed -n "${gpu_line}p" | sed -e 's/^[ ]*//g' | sed -e 's/[ ]*$//g')
if [ "$pid_gpu" == "$target_device" ]; then
echo "kill pid: $pid, gpu: $pid_gpu"
kill -9 $pid
echo -e "\n ---- Clean up processes that use the target device $TARGET_DEVICE"
GPU_USED_BY_PIDS=$(rocm-smi --showpidgpus)
PID_NUMBERS_LINES=$(echo "$GPU_USED_BY_PIDS" | grep -n "DRM device" | cut -d ":" -f 1)
PID_NUMBERS_LINES_ARRAY=($PID_NUMBERS_LINES)

for ((i = 0; i < ${#PID_NUMBERS_LINES_ARRAY[@]}; i++)); do
PID_NUMBER_LINE=${PID_NUMBERS_LINES_ARRAY[$i]}
PID_NUMBER=$(echo "$GPU_USED_BY_PIDS" | awk '{print $2}' | sed -n "${PID_NUMBER_LINE}p")
GPU_USED_BY_PID_LINE=$((PID_NUMBER_LINE + 1))
GPU_USED_BY_PID=$(echo "$GPU_USED_BY_PIDS" | sed -n "${GPU_USED_BY_PID_LINE}p" | sed -e 's/^[ ]*//g' | sed -e 's/[ ]*$//g')
if [ "$GPU_USED_BY_PID" == "$TARGET_DEVICE" ]; then
echo "kill pid: $PID_NUMBER, using gpu: $GPU_USED_BY_PID"
kill -9 "$PID_NUMBER"
fi
done
21 changes: 14 additions & 7 deletions tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,22 @@

set -ex

rocm_version=$1
mi200_gpus=$(rocm-smi --showproductname | grep -c "MI250" | xargs)
# Print the accepted command-line option to stderr and exit with status 1.
usage() { echo "Usage: $0 [-v <ROCm version>]" 1>&2; exit 1; }

echo "mi200_gpus: $mi200_gpus"
# Parse the single -v option (requires a value) into ROCM_VERSION. Anything
# getopts does not recognise falls through to the catch-all branch, which
# calls usage.
while getopts "v:" parameter_Option
do case "${parameter_Option}"
in
v) ROCM_VERSION=${OPTARG};;
*) usage ;;
esac
done

if [ "$mi200_gpus" -gt "0" ]; then
result_file=ci-mi200.huggingface.bert-large-rocm${rocm_version}.json
MI200_DEVICE_NUMBERS=$(rocm-smi --showproductname | grep -c "MI250" | xargs)

if [ "$MI200_DEVICE_NUMBERS" -gt "0" ]; then
RESULT_FILE=ci-mi200.huggingface.bert-large-rocm${ROCM_VERSION}.json
else
result_file=ci-mi100.huggingface.bert-large-rocm${rocm_version}.json
RESULT_FILE=ci-mi100.huggingface.bert-large-rocm${ROCM_VERSION}.json
fi

python \
Expand All @@ -33,4 +40,4 @@ cat ci-pipeline-actual.json

python /onnxruntime_src/orttraining/tools/ci_test/compare_huggingface.py \
ci-pipeline-actual.json \
/onnxruntime_src/orttraining/tools/ci_test/results/${result_file}
/onnxruntime_src/orttraining/tools/ci_test/results/"$RESULT_FILE"
4 changes: 2 additions & 2 deletions tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
FROM rocm/pytorch:rocm5.5_ubuntu20.04_py3.8_pytorch_1.13.1
FROM rocm/cupy:rocm5.5.0_ubuntu20.04_py3.8_pytorch2.0.0_cupy13.0.0

RUN apt-get update -y && apt-get upgrade -y
RUN apt-get update -y && apt-get upgrade -y && apt-get autoremove -y && apt-get clean -y

WORKDIR /stage

Expand Down

0 comments on commit 8ede2f1

Please sign in to comment.