Skip to content

Commit

Permalink
Fix the E2E test for the TFJob simple prototype test.
Browse files Browse the repository at this point in the history
* Fix kubeflow#1426

There are two problems with the test

  1. Test isn't properly reporting results to gubernator, so test failures
     aren't being noticed.
  2. Test needs to be updated to work with v1alpha2.

* The TestSuite name needs to be set because this is used as the name
  of the junit XML file.

* simple-prototype-test should set test_dir and artifacts_dir.

* Fix the test; use tf_job_client to wait for the job to be in the Running
  condition. This should be more reliable than checking for actual pods.

* The test has probably been broken for a while, but this went unnoticed
  because results weren't being properly surfaced in test grid: the junit
  XML file was improperly named. I suspect things broke as part of
  the switch to v1alpha2, which changed the names of the pods.
  • Loading branch information
jlewi committed Aug 25, 2018
1 parent 106e267 commit 99a786a
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 30 deletions.
51 changes: 21 additions & 30 deletions testing/tf_job_simple_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@
import os
import re
import subprocess

from kubernetes import client as k8s_client
from kubeflow.testing import test_helper, util
from py import tf_job_client
from retrying import retry

NAMESPACE = "default"
Expand All @@ -45,34 +48,11 @@ def parse_args():
args, _ = parser.parse_known_args()
return args

@retry(wait_fixed=5000, stop_max_attempt_number=20)
def wait_for_tf_job():
    """Ensure the pods for the TFJob are running and its services exist.

    Any exception raised here triggers the ``retry`` decorator, which waits
    5 seconds between attempts and gives up after 20 attempts.

    Raises:
      Exception: If the labelled pods are missing or fewer/more than two of
        them report ``Running``, or if the labelled services are missing.
    """
    label_selector = "tf_job_name=mycnnjob"

    # For debugging purposes list all pods and their labels.
    # This makes it easy to see if the problem is that we specified
    # the wrong label selector.
    util.run(["kubectl", "--namespace=" + NAMESPACE,
              "get", "pods", "-o",
              ("custom-columns=name:metadata.name,"
               "labels:.metadata.labels,status:status.phase")])

    pods_out = util.run(["kubectl", "get", "pods", "-l",
                         label_selector, "-n" + NAMESPACE])
    if "No resources found" in pods_out or pods_out.count('Running') != 2:
        raise Exception("Could not find pods with label tf_job_name=mycnnjob")
    logging.info("Found pods with label tf_job_name=mycnnjob")

    # Query services in the same namespace as the pods; the namespace used to
    # be hard-coded here, which would silently diverge if NAMESPACE changed.
    services_out = util.run(["kubectl", "get", "services", "-l",
                             label_selector, "-n" + NAMESPACE])
    # The output is expected to split into exactly three lines on "\n".
    if ("No resources found" in services_out
            or len(services_out.split("\n")) != 3):
        raise Exception(
            "Could not find services with label tf_job_name=mycnnjob")
    logging.info("Found services with label tf_job_name=mycnnjob")

@retry(stop_max_attempt_number=3)
def test_tf_job_simple(test_case): # pylint: disable=redefined-outer-name
args = parse_args()
def create_app_and_job(args, namespace, name):
try:
util.run(["ks", "init", "tf-job-simple-app", "--skip-default-registries"])
util.run(["ks", "init", "tf-job-simple-app", "--skip-default-registries",
"--namespace=" + namespace])
except subprocess.CalledProcessError as e:
# Keep going if the app already exists. This is a sign that a previous
# attempt failed and we are retrying.
Expand Down Expand Up @@ -104,18 +84,29 @@ def test_tf_job_simple(test_case): # pylint: disable=redefined-outer-name
raise ValueError("Unrecognized value for tf_job_version: %s" %
args.tf_job_version)

util.run(["ks", "generate", prototype_name, "tf-job-simple"])
util.run(["ks", "generate", prototype_name, name])
util.run(["ks", "apply", "default", "-c", "tf-job-simple"])

def test_tf_job_simple(test_case): # pylint: disable=redefined-outer-name
"""Create the simple TFJob and wait for it to reach the Running condition.

Args:
test_case: Test case object for this run; a failure while waiting is
recorded on it via add_failure_info instead of being re-raised.
"""
args = parse_args()
namespace = "default"
name = "tf-job-simple"

util.load_kube_config()
api_client = k8s_client.ApiClient()
create_app_and_job(args, namespace, name)
try:
# NOTE(review): this text is a diff rendered without +/- markers; the
# wait_for_tf_job() call below looks like the removed line and the
# tf_job_client.wait_for_condition() call its replacement -- confirm
# against the committed file.
wait_for_tf_job()
tf_job_client.wait_for_condition(
api_client, namespace, name, ["Running"],
status_callback=tf_job_client.log_status)
logging.info("TFJob launched successfully")
except Exception as e:
# The exception is logged and attached to the test case, not re-raised.
# NOTE(review): e.message is only available on Python 2 exceptions;
# str(e) would be portable -- confirm the interpreter this runs under.
logging.error("Test failed waiting for job; %s", e)
test_case.add_failure_info(e.message)


if __name__ == "__main__":
# Wrap the test function in a single TestCase and run it as a suite.
test_case = test_helper.TestCase(
name="test_tf_job_simple", test_func=test_tf_job_simple)
test_suite = test_helper.init(
# NOTE(review): the two name= lines below look like the old (empty) and new
# values from the diff. Per the commit message the suite name is used as
# the name of the junit XML file -- confirm only the non-empty one remains.
name="", test_cases=[test_case])
name="test_tf_job_simple", test_cases=[test_case])
test_suite.run()
2 changes: 2 additions & 0 deletions testing/workflows/components/workflows.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,8 @@
"testing.tf_job_simple_test",
"--src_dir=" + tests.srcDir,
"--tf_job_version=v1alpha2",
"--test_dir=" + tests.testDir,
"--artifacts_dir=" + tests.artifactsDir,
],
},

Expand Down

0 comments on commit 99a786a

Please sign in to comment.