-
Notifications
You must be signed in to change notification settings - Fork 3.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Github Action to run TPU tests. (#2376)
* Add Github Action to run TPU tests. * Trigger new Github Actions run. * Clean up more comments. * Use different fixed version of ml-testing-accelerators and update config to match. * use cluster in us-central1-a * Run 'gcloud logging read' directly without 'echo' to preserve newlines. * cat coverage.xml on the TPU VM side and upload xml on the Github Action side * Use new commit on ml-testing-accelerators so command runs fully. * Preserve newlines in the xml and use if: always() temporarily to upload codecov * Use pytorch_lightning for coverage instead of pytorch-lightning * Remove the debug cat of coverage xml * Apply suggestions from code review * jsonnet rename * name * add codecov flags * add codecov flags * codecov * codecov * revert codecov * Clean up after apt-get and remove old TODOs. * More codefactor cleanups. * drone * drone * disable codecov * cleaning * docker py versions * docker py 3.7 * readme * bash * docker * freeze conda * py3.6 * Stop using apt-get clean. * Dont rm pytorch-lightning * Update docker/tpu/Dockerfile * Longer timeout in the Github Action to wait for GKE to finish. * job1 * job2 * job3 Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: Jirka <jirka@pytorchlightning.ai>
- Loading branch information
1 parent
dcd6000
commit 1a40963
Showing
7 changed files
with
230 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
name: TPU tests | ||
|
||
on: | ||
push: | ||
branches: | ||
- master | ||
pull_request: | ||
branches: | ||
- master | ||
|
||
env: | ||
PROJECT_ID: ${{ secrets.GKE_PROJECT }} | ||
GKE_CLUSTER: lightning-cluster | ||
GKE_ZONE: us-central1-a | ||
IMAGE: gcr.io/${{ secrets.GKE_PROJECT }}/tpu-testing-image | ||
|
||
jobs: | ||
setup-build-publish-deploy: | ||
name: tpu-testing-job | ||
runs-on: ubuntu-latest | ||
|
||
steps: | ||
- name: Install Go | ||
uses: actions/setup-go@v2 | ||
with: | ||
go-version: 1.14.x | ||
|
||
- name: Checkout Pytorch Lightning | ||
uses: actions/checkout@v2 | ||
with: | ||
repository: PyTorchLightning/pytorch-lightning | ||
ref: ${{ github.event.pull_request.head.sha }} | ||
path: main | ||
|
||
- name: Checkout ml-testing-accelerators | ||
uses: actions/checkout@v2 | ||
with: | ||
repository: GoogleCloudPlatform/ml-testing-accelerators | ||
path: ml-testing-accelerators | ||
ref: 5e88ac24f631c27045e62f0e8d5dfcf34e425e25 | ||
|
||
- name: Setup gcloud CLI | ||
uses: GoogleCloudPlatform/github-actions/setup-gcloud@master | ||
with: | ||
version: '290.0.1' | ||
service_account_key: ${{ secrets.GKE_SA_KEY_BASE64 }} | ||
project_id: ${{ secrets.GKE_PROJECT }} | ||
export_default_credentials: true | ||
|
||
# Configure Docker to use the gcloud command-line tool as a credential helper for authentication. | ||
- name: Configure Docker | ||
run: |- | ||
gcloud --quiet auth configure-docker | ||
shell: bash | ||
- name: Build and Push Docker Image | ||
run: | | ||
cd main/docker/tpu | ||
docker build --tag "$IMAGE:$GITHUB_RUN_ID" -f Dockerfile --build-arg "GITHUB_REF=$GITHUB_REF" --build-arg "TEST_IMAGE=1" . | ||
docker push "$IMAGE:$GITHUB_RUN_ID" | ||
shell: bash | ||
|
||
- name: Install jsonnet | ||
run: |- | ||
go get github.com/google/go-jsonnet/cmd/jsonnet | ||
shell: bash | ||
# Get the GKE credentials so we can deploy to the cluster | ||
# Use either zone or region depending on cluster setup. | ||
- run: |- | ||
gcloud container clusters get-credentials "$GKE_CLUSTER" --zone "$GKE_ZONE" | ||
shell: bash | ||
# Render the test job from tpu_test_cases.jsonnet and create it with kubectl, | ||
# then poll the job status (up to max_checks times, 30s apart) until it reports | ||
# a failure or a success. Afterwards fetch the job's container logs with | ||
# `gcloud logging read`, split them at the coverage XML header, print the test | ||
# log portion, delete the pushed image tag and exit with the job's status code | ||
# (0 = succeeded, 1 = failed, 2 = still unfinished after the last check). | ||
# NOTE(review): every command is chained with `&&`, so if log retrieval or | ||
# csplit fails, the image cleanup and `exit $status_code` are never reached; | ||
# confirm that this is intended. | ||
- name: Deploy the job on the kubernetes cluster | ||
run: |- | ||
job_name=$(jsonnet -J ml-testing-accelerators/ main/docker/tpu/tpu_test_cases.jsonnet --ext-str image=$IMAGE --ext-str image-tag=$GITHUB_RUN_ID | kubectl create -f -) && \ | ||
job_name=${job_name#job.batch/} && \ | ||
job_name=${job_name% created} && \ | ||
echo "Waiting on kubernetes job: $job_name in cluster: $GKE_CLUSTER" && \ | ||
i=0 && \ | ||
# 30 checks spaced 30s apart = 900s total. | ||
max_checks=30 && \ | ||
status_code=2 && \ | ||
# Check on the job periodically. Set the status code depending on what | ||
# happened to the job in Kubernetes. If we try max_checks times and | ||
# still the job hasn't finished, give up and return the starting | ||
# non-zero status code. | ||
while [ $i -lt $max_checks ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else echo "Job not finished yet"; fi; sleep 30; done && \ | ||
echo "Done waiting. Job status code: $status_code" && \ | ||
# Allow time for logs to flush. | ||
sleep 60 && \ | ||
echo "JOB_NAME: $job_name" && \ | ||
echo "GKE_CLUSTER: $GKE_CLUSTER" && \ | ||
echo "GKE_ZONE: $GKE_ZONE" && \ | ||
gcloud logging read "resource.type=k8s_container resource.labels.project_id=$PROJECT_ID resource.labels.location=$GKE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" --limit 10000000 --order asc --format 'value(textPayload)' --project=$PROJECT_ID > /tmp/full_output.txt && \ | ||
csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/' && \ | ||
# First portion is the test logs. Print these to Github Action stdout. | ||
cat xx00 && \ | ||
echo "Done with log retrieval attempt." && \ | ||
gcloud container images delete "$IMAGE:$GITHUB_RUN_ID" --force-delete-tags && \ | ||
exit $status_code | ||
shell: bash | ||
|
||
# todo: to be used after enable merging reports from different CIs | ||
#- name: Upload coverage to Codecov | ||
# uses: codecov/codecov-action@v1 | ||
# if: always() | ||
# with: | ||
# token: ${{ secrets.CODECOV_TOKEN }} | ||
# file: ./xx01 | ||
# flags: tpu,pytest | ||
# # env_vars: OS,PYTHON | ||
# # name: codecov-umbrella | ||
# fail_ci_if_error: true | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
FROM google/cloud-sdk:slim | ||
|
||
# Build args. | ||
ARG GITHUB_REF=refs/heads/master | ||
ARG TEST_IMAGE=0 | ||
|
||
# This Dockerfile installs pytorch/xla wheels built for the Python version | ||
# set below (currently 3.6). Python 3.7 wheels are also available; see below. | ||
ENV PYTHON_VERSION=3.6 | ||
|
||
RUN apt-get update && apt-get install -y --no-install-recommends \ | ||
build-essential \ | ||
cmake \ | ||
git \ | ||
curl \ | ||
ca-certificates | ||
|
||
# Install conda and python. | ||
# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385 | ||
RUN curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh && \ | ||
chmod +x ~/miniconda.sh && \ | ||
~/miniconda.sh -b && \ | ||
rm ~/miniconda.sh | ||
|
||
ENV PATH=/root/miniconda3/bin:$PATH | ||
|
||
RUN conda create -y --name container python=$PYTHON_VERSION | ||
|
||
# Run the rest of commands within the new conda env. | ||
# Use absolute path to appease Codefactor. | ||
SHELL ["/root/miniconda3/bin/conda", "run", "-n", "container", "/bin/bash", "-c"] | ||
RUN conda install -y python=$PYTHON_VERSION mkl | ||
|
||
# Replace any preinstalled torch with the nightly torch, torch_xla and | ||
# torchvision wheels copied from the tpu-pytorch bucket, delete the downloaded | ||
# wheel files, then install the libomp5 package. | ||
# NOTE(review): the wheel names are wrapped in single quotes, which bash itself | ||
# does not expand; presumably `${PYTHON_VERSION/./}` only resolves because of | ||
# how `conda run` (the SHELL set above) re-quotes the command. Confirm. | ||
RUN pip uninstall -y torch && \ | ||
# Wheel tags use PYTHON_VERSION with the dot removed (3.6 -> cp36-cp36m); Python 3.7 wheels are also available. | ||
gsutil cp 'gs://tpu-pytorch/wheels/torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \ | ||
gsutil cp 'gs://tpu-pytorch/wheels/torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \ | ||
gsutil cp 'gs://tpu-pytorch/wheels/torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \ | ||
pip install 'torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \ | ||
pip install 'torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \ | ||
pip install 'torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \ | ||
rm 'torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \ | ||
rm 'torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \ | ||
rm 'torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \ | ||
apt-get install -y libomp5 | ||
|
||
# Point the dynamic loader at the `container` conda env's lib directory. | ||
# The path must be absolute (conda lives under /root/miniconda3, see PATH | ||
# above); a relative entry would be resolved against the current working | ||
# directory of whichever process is started. | ||
ENV LD_LIBRARY_PATH=/root/miniconda3/envs/container/lib | ||
|
||
# Install pytorch-lightning at the current PR, plus dependencies. | ||
RUN git clone https://github.com/PyTorchLightning/pytorch-lightning.git && \ | ||
cd pytorch-lightning && \ | ||
git fetch origin $GITHUB_REF:CI && \ | ||
git checkout CI && \ | ||
cd .. && \ | ||
pip install ./pytorch-lightning | ||
|
||
# If using this image for tests, install more dependencies and don't delete | ||
# the source code where the tests live. | ||
RUN if [ $TEST_IMAGE -eq 1 ] ; then \ | ||
pip install -r pytorch-lightning/requirements/test.txt ; \ | ||
else \ | ||
rm -rf pytorch-lightning ; \ | ||
fi | ||
|
||
RUN conda init bash | ||
RUN python -c "import pytorch_lightning as pl; print(pl.__version__)" | ||
|
||
COPY docker-entrypoint.sh /usr/local/bin/ | ||
RUN chmod +x /usr/local/bin/docker-entrypoint.sh | ||
|
||
ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"] | ||
CMD ["bash"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
#!/bin/bash | ||
# Container entrypoint: sets up the shell environment, exports the XRT TPU | ||
# configuration derived from the endpoint variable, then runs the command | ||
# that was passed to the container. | ||
# | ||
# Load ~/.bashrc first; presumably this picks up the shell setup written by | ||
# `conda init bash` in the Dockerfile so `conda activate` works. Confirm. | ||
source ~/.bashrc | ||
echo "running docker-entrypoint.sh" | ||
# Switch to the `container` conda environment. | ||
conda activate container | ||
# Print the TPU endpoint variable for debugging. It is not set in this | ||
# script, so it is expected to come from the container's environment. | ||
echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS | ||
echo "printed TPU info" | ||
# `${VAR:7}` drops the first 7 characters of the endpoint value, presumably | ||
# a `grpc://` scheme prefix. TODO confirm. | ||
export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}" | ||
# Replace this shell with the command given as arguments. | ||
exec "$@" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
local base = import 'templates/base.libsonnet'; | ||
local tpus = import 'templates/tpus.libsonnet'; | ||
local utils = import "templates/utils.libsonnet"; | ||
|
||
# TPU test job definition: extends base.BaseTest with the settings below. | ||
local tputests = base.BaseTest { | ||
frameworkPrefix: 'pl', | ||
modelName: 'tpu-tests', | ||
mode: 'postsubmit', | ||
configMaps: [], | ||
|
||
timeout: 900, # 15 minutes, in seconds. | ||
|
||
# Image name and tag are supplied from outside through the 'image' and | ||
# 'image-tag' external variables. | ||
image: std.extVar('image'), | ||
imageTag: std.extVar('image-tag'), | ||
|
||
# '+:' merges these fields into the inherited tpuSettings instead of | ||
# replacing the whole object. | ||
tpuSettings+: { | ||
softwareVersion: 'pytorch-nightly', | ||
}, | ||
accelerator: tpus.v3_8, | ||
|
||
# Script handed to utils.scriptCommand: run the TPU tests under coverage, | ||
# remember pytest's exit status, print an end-of-logs marker followed by the | ||
# XML coverage report (tabs stripped), and finish with pytest's status so a | ||
# test failure is not masked by the later commands. | ||
command: utils.scriptCommand( | ||
||| | ||
coverage run --source=pytorch_lightning -m pytest pytorch-lightning/tests/models/test_tpu.py -v | ||
test_exit_code=$? | ||
echo "\n||| END PYTEST LOGS |||\n" | ||
coverage xml | ||
cat coverage.xml | tr -d '\t' | ||
test $test_exit_code -eq 0 | ||
||| | ||
), | ||
}; | ||
|
||
tputests.oneshotJob |