Merge pull request #2130 from recommenders-team/miguel/test_readme
Review test readme and define best practices
miguelgfierro committed Jul 18, 2024
2 parents 3672c2e + a441cb4 commit 77c6fe5
Showing 19 changed files with 74 additions and 68 deletions.
18 changes: 12 additions & 6 deletions tests/README.md
@@ -5,6 +5,12 @@ Licensed under the MIT License.

# Tests

The Recommenders test pipeline is one of the most sophisticated MLOps pipelines in the open-source community. We execute tests in the three environments we support (CPU, GPU, and Spark), and we repeat them for each Python version we support. We test not only the library, but also the Jupyter notebooks in the examples folder.

The reason for this extensive test infrastructure is to ensure that the code is reproducible by the community and that we can maintain the project with a small number of core contributors.

We currently execute over a thousand tests in the project, and we are always looking for ways to improve the test coverage. To get the exact number of tests, you can run `pytest tests --collect-only`, and then multiply the resulting count by the number of Python versions we support.
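
The collection can also be driven programmatically with pytest's Python API; the sketch below is a hypothetical helper, not part of the repository:

```python
# Hedged sketch: count the collected tests with pytest's public API. The
# _Counter plugin is a hypothetical helper used only for illustration.
import pytest


class _Counter:
    def __init__(self):
        self.count = 0

    def pytest_collection_modifyitems(self, items):
        # Called once after collection; `items` holds every collected test
        self.count = len(items)


counter = _Counter()
pytest.main(["tests", "--collect-only", "-q"], plugins=[counter])
# Multiply by the number of supported Python versions to approximate the
# total number of test executions across the CI matrix.
print(f"Collected {counter.count} tests in this environment")
```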

In this document we show our test infrastructure and how to contribute tests to the repository.

## Table of Contents
@@ -74,15 +80,10 @@ In this section we show how to create tests and add them to the test pipeline. T

### How to create tests for the Recommenders library

You want to make sure that all your code works before you submit it to the repository. Here are some guidelines for creating the unit tests:
You want to make sure that all your code works before you submit it to the repository. Here are some guidelines for creating the tests:

* It is better to create multiple small tests than one large test that checks all the code.
* Use `@pytest.fixture` to create data in your tests.
* Use the mark `@pytest.mark.gpu` if you want the test to be executed
in a GPU environment. Use `@pytest.mark.spark` if you want the test
to be executed in a Spark environment.
* Use `@pytest.mark.notebooks` if you are testing a notebook.
* Avoid using `is` in the asserts, instead use the operator `==`.
* Follow the pattern `assert computation == value`, for example:
```python
assert results["precision"] == pytest.approx(0.330753)
@@ -92,6 +93,11 @@
assert rmse(rating_true, rating_true) == 0
assert rmse(rating_true, rating_pred) == pytest.approx(7.254309)
```
* Use the operator `==` with values. Use the operator `is` with singletons such as `None`, `True`, or `False`.
* Make explicit asserts. In other words, assert against a value (`assert computation == value`) rather than just `assert computation`; a short sketch combining these guidelines follows this list.
* Use the mark `@pytest.mark.gpu` if you want the test to be executed in a GPU environment. Use `@pytest.mark.spark` if you want the test to be executed in a Spark environment.
* Use `@pytest.mark.notebooks` if you are testing a notebook.
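
Putting these guidelines together, a minimal test could look like the sketch below. The fixture data and the expected value are illustrative assumptions, not taken from the repository:

```python
# Hedged sketch of a test that follows the guidelines above; the data and the
# expected result are made up for illustration.
import pandas as pd
import pytest

from recommenders.evaluation.python_evaluation import rmse


@pytest.fixture
def rating_true():
    # Small, deterministic data created with a fixture instead of loading files
    return pd.DataFrame(
        {"userID": [1, 1, 2], "itemID": [1, 2, 1], "rating": [5.0, 4.0, 3.0]}
    )


# Add @pytest.mark.gpu or @pytest.mark.spark if the test needs those
# environments; plain tests like this one run in the CPU environment.
def test_rmse_identical_ratings(rating_true):
    # Explicit assert against a value, using the == operator
    assert rmse(rating_true, rating_true, col_prediction="rating") == 0
```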


### How to create tests for the notebooks

2 changes: 1 addition & 1 deletion tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py
@@ -324,7 +324,7 @@ def create_arg_parser():
parser.add_argument(
"--test",
action="store",
default="./tests/ci/azureml_tests/run_groupwise_pytest.py",
default="tests/ci/azureml_tests/run_groupwise_pytest.py",
help="location of script to run pytest",
)
# max num nodes in Azure cluster
26 changes: 13 additions & 13 deletions tests/ci/azureml_tests/test_groups.py
@@ -63,24 +63,24 @@
],
"group_gpu_001": [ # Total group time: 1937.01s
"tests/unit/examples/test_notebooks_gpu.py::test_gpu_vm", # 0.76s (Always the first test to check the GPU works)
"tests/smoke/recommenders/recommender/test_deeprec_utils.py", # 2.91
"tests/smoke/recommenders/recommender/test_deeprec_model.py::test_FFM_iterator", # 0.74s
"tests/smoke/recommenders/recommender/test_newsrec_utils.py::test_news_iterator", # 3.04s
"tests/smoke/recommenders/models/test_deeprec_utils.py", # 2.91
"tests/smoke/recommenders/models/test_deeprec_model.py::test_FFM_iterator", # 0.74s
"tests/smoke/recommenders/models/test_newsrec_utils.py::test_news_iterator", # 3.04s
#
"tests/smoke/recommenders/recommender/test_deeprec_model.py::test_model_lightgcn", # 6.03s
"tests/smoke/recommenders/models/test_deeprec_model.py::test_model_lightgcn", # 6.03s
"tests/functional/examples/test_notebooks_gpu.py::test_lightgcn_deep_dive_functional", # 19.45s
#
# "tests/smoke/recommenders/recommender/test_deeprec_model.py::test_model_sum", # 27.23s # FIXME: Disabled due to the issue with TF version > 2.10.1 See #2018
# "tests/smoke/recommenders/models/test_deeprec_model.py::test_model_sum", # 27.23s # FIXME: Disabled due to the issue with TF version > 2.10.1 See #2018
#
"tests/smoke/recommenders/recommender/test_deeprec_model.py::test_model_dkn", # 187.20s
"tests/smoke/recommenders/models/test_deeprec_model.py::test_model_dkn", # 187.20s
"tests/functional/examples/test_notebooks_gpu.py::test_dkn_quickstart_functional", # 1167.93s
#
"tests/functional/examples/test_notebooks_gpu.py::test_slirec_quickstart_functional", # 175.00s
"tests/smoke/recommenders/recommender/test_deeprec_model.py::test_model_slirec", # 346.72s
"tests/smoke/recommenders/models/test_deeprec_model.py::test_model_slirec", # 346.72s
],
"group_gpu_002": [ # Total group time: 1896.76s
"tests/unit/examples/test_notebooks_gpu.py::test_gpu_vm", # 0.76s (Always the first test to check the GPU works)
"tests/smoke/recommenders/recommender/test_deeprec_model.py::test_model_xdeepfm", # 3.10s
"tests/smoke/recommenders/models/test_deeprec_model.py::test_model_xdeepfm", # 3.10s
# FIXME: https://github.com/microsoft/recommenders/issues/1883
# "tests/smoke/examples/test_notebooks_gpu.py::test_xdeepfm_smoke", # 77.93s
"tests/functional/examples/test_notebooks_gpu.py::test_xdeepfm_functional",
@@ -100,9 +100,9 @@
"tests/smoke/examples/test_notebooks_gpu.py::test_ncf_deep_dive_smoke", # 102.71s
"tests/functional/examples/test_notebooks_gpu.py::test_ncf_deep_dive_functional", # 351.17s
#
"tests/smoke/recommenders/recommender/test_newsrec_utils.py::test_naml_iterator", # 5.50s
"tests/smoke/recommenders/models/test_newsrec_utils.py::test_naml_iterator", # 5.50s
# FIXME: https://github.com/microsoft/recommenders/issues/1883
# "tests/smoke/recommenders/recommender/test_newsrec_model.py::test_model_naml", # 450.65s
# "tests/smoke/recommenders/models/test_newsrec_model.py::test_model_naml", # 450.65s
],
"group_gpu_004": [ # Total group time: 2103.34s
"tests/unit/examples/test_notebooks_gpu.py::test_gpu_vm", # 0.76s (Always the first test to check the GPU works)
@@ -125,8 +125,8 @@
],
"group_gpu_006": [ # Total group time: 1763.99s
"tests/unit/examples/test_notebooks_gpu.py::test_gpu_vm", # 0.76s (Always the first test to check the GPU works)
"tests/smoke/recommenders/recommender/test_newsrec_model.py::test_model_npa", # 202.61s
"tests/smoke/recommenders/recommender/test_newsrec_model.py::test_model_nrms", # 188.60s
"tests/smoke/recommenders/models/test_newsrec_model.py::test_model_npa", # 202.61s
"tests/smoke/recommenders/models/test_newsrec_model.py::test_model_nrms", # 188.60s
],
"group_gpu_007": [ # Total group time: 846.89s
"tests/unit/examples/test_notebooks_gpu.py::test_gpu_vm", # 0.76s (Always the first test to check the GPU works)
@@ -138,7 +138,7 @@
# "tests/functional/examples/test_notebooks_gpu.py::test_naml_quickstart_functional", # 2033.85s
# FIXME: https://github.com/microsoft/recommenders/issues/1716
# "tests/functional/examples/test_notebooks_gpu.py::test_sasrec_quickstart_functional", # 448.06s + 614.69s
"tests/smoke/recommenders/recommender/test_newsrec_model.py::test_model_lstur", # 194.88s
"tests/smoke/recommenders/models/test_newsrec_model.py::test_model_lstur", # 194.88s
],
"group_spark_001": [ # Total group time: 987.16s
"tests/data_validation/recommenders/datasets/test_movielens.py::test_load_spark_df", # 4.33s+ 25.58s + 101.99s + 139.23s
2 changes: 0 additions & 2 deletions tests/data_validation/examples/test_wikidata.py
@@ -8,7 +8,6 @@


@pytest.mark.notebooks
# @pytest.mark.skip(reason="Wikidata API is unstable")
def test_wikidata_runs(notebooks, output_notebook, kernel_name, tmp):
notebook_path = notebooks["wikidata_knowledge_graph"]
MOVIELENS_SAMPLE_SIZE = 5
@@ -25,7 +24,6 @@ def test_wikidata_runs(notebooks, output_notebook, kernel_name, tmp):


@pytest.mark.notebooks
# @pytest.mark.skip(reason="Wikidata API is unstable")
def test_wikidata_values(notebooks, output_notebook, kernel_name):
notebook_path = notebooks["wikidata_knowledge_graph"]
execute_notebook(
6 changes: 3 additions & 3 deletions tests/data_validation/recommenders/datasets/test_movielens.py
@@ -117,7 +117,7 @@ def test_download_and_extract_movielens(size, tmp):
zip_path = os.path.join(tmp, "ml.zip")
download_movielens(size, dest_path=zip_path)
assert len(os.listdir(tmp)) == 1
assert os.path.exists(zip_path)
assert os.path.exists(zip_path) is True

rating_path = os.path.join(tmp, "rating.dat")
item_path = os.path.join(tmp, "item.dat")
@@ -126,8 +126,8 @@
)
# Test if raw-zip file, rating file, and item file are cached
assert len(os.listdir(tmp)) == 3
assert os.path.exists(rating_path)
assert os.path.exists(item_path)
assert os.path.exists(rating_path) is True
assert os.path.exists(item_path) is True


@pytest.mark.parametrize(
@@ -12,4 +12,4 @@ def test_criteo_privacy(criteo_first_row):
data is anonymized.
"""
df = criteo.load_pandas_df(size="sample")
assert df.loc[0].equals(pd.Series(criteo_first_row))
assert df.loc[0].equals(pd.Series(criteo_first_row)) is True
@@ -11,4 +11,5 @@ def test_movielens_privacy():
"""
df = movielens.load_pandas_df(size="100k")
users = df["userID"].values.tolist()
assert all(isinstance(x, int) for x in users)

assert all(isinstance(x, int) for x in users) is True
File renamed without changes.
12 changes: 6 additions & 6 deletions tests/unit/recommenders/datasets/test_download_utils.py
@@ -25,7 +25,7 @@ def test_maybe_download(files_fixtures):
os.remove(filepath)

downloaded_filepath = maybe_download(file_url, "license.txt", expected_bytes=1212)
assert os.path.exists(downloaded_filepath)
assert os.path.exists(downloaded_filepath) is True
assert os.path.basename(downloaded_filepath) == "license.txt"


@@ -51,7 +51,7 @@ def test_maybe_download_maybe(caplog, files_fixtures):
os.remove(filepath)

downloaded_filepath = maybe_download(file_url, "license.txt")
assert os.path.exists(downloaded_filepath)
assert os.path.exists(downloaded_filepath) is True
maybe_download(file_url, "license.txt")
assert "File ." + os.path.sep + "license.txt already downloaded" in caplog.text

@@ -69,11 +69,11 @@ def test_maybe_download_retry(caplog):
def test_download_path():
# Check that the temporal path is created and deleted
with download_path() as path:
assert os.path.isdir(path)
assert not os.path.isdir(path)
assert os.path.isdir(path) is True
assert os.path.isdir(path) is False

# Check the behavior when a path is provided
tmp_dir = TemporaryDirectory()
with download_path(tmp_dir.name) as path:
assert os.path.isdir(path)
assert os.path.isdir(path)
assert os.path.isdir(path) is True
assert os.path.isdir(path) is True
22 changes: 11 additions & 11 deletions tests/unit/recommenders/datasets/test_pandas_df_utils.py
@@ -235,10 +235,10 @@ def test_has_columns():
df_1 = pd.DataFrame(dict(a=[1, 2, 3]))
df_2 = pd.DataFrame(dict(b=[7, 8, 9], a=[1, 2, 3]))

assert has_columns(df_1, ["a"])
assert has_columns(df_2, ["a"])
assert has_columns(df_2, ["a", "b"])
assert not has_columns(df_2, ["a", "b", "c"])
assert has_columns(df_1, ["a"]) is True
assert has_columns(df_2, ["a"]) is True
assert has_columns(df_2, ["a", "b"]) is True
assert has_columns(df_2, ["a", "b", "c"]) is False


def test_has_same_base_dtype():
@@ -256,19 +256,19 @@ def test_has_same_base_dtype():
df_6 = pd.DataFrame(dict(a=arr_str))

# all columns match
assert has_same_base_dtype(df_1, df_2)
assert has_same_base_dtype(df_1, df_2) is True
# specific column matches
assert has_same_base_dtype(df_3, df_4, columns=["a"])
assert has_same_base_dtype(df_3, df_4, columns=["a"]) is True
# some column types do not match
assert not has_same_base_dtype(df_3, df_4)
assert has_same_base_dtype(df_3, df_4) is False
# column types do not match
assert not has_same_base_dtype(df_1, df_3, columns=["a"])
assert has_same_base_dtype(df_1, df_3, columns=["a"]) is False
# all columns are not shared
assert not has_same_base_dtype(df_4, df_5)
assert has_same_base_dtype(df_4, df_5) is False
# column types do not match
assert not has_same_base_dtype(df_5, df_6, columns=["a"])
assert has_same_base_dtype(df_5, df_6, columns=["a"]) is False
# assert string columns match
assert has_same_base_dtype(df_6, df_6)
assert has_same_base_dtype(df_6, df_6) is True


def test_lru_cache_df():
10 changes: 5 additions & 5 deletions tests/unit/recommenders/datasets/test_spark_splitter.py
@@ -80,8 +80,8 @@ def test_min_rating_filter(spark_dataset):
x["count"] >= 5 for x in dfs_item.groupBy(DEFAULT_ITEM_COL).count().collect()
]

assert all(user_rating_counts)
assert all(item_rating_counts)
assert all(user_rating_counts) is True
assert all(item_rating_counts) is True


@pytest.mark.spark
@@ -123,16 +123,16 @@ def test_chrono_splitter(spark_dataset):

assert set(users_train) == set(users_test)

assert _if_later(splits[0], splits[1])
assert _if_later(splits[0], splits[1]) is True

splits = spark_chrono_split(spark_dataset, ratio=RATIOS)

assert splits[0].count() / NUM_ROWS == pytest.approx(RATIOS[0], TOL)
assert splits[1].count() / NUM_ROWS == pytest.approx(RATIOS[1], TOL)
assert splits[2].count() / NUM_ROWS == pytest.approx(RATIOS[2], TOL)

assert _if_later(splits[0], splits[1])
assert _if_later(splits[1], splits[2])
assert _if_later(splits[0], splits[1]) is True
assert _if_later(splits[1], splits[2]) is True


@pytest.mark.spark
26 changes: 13 additions & 13 deletions tests/unit/recommenders/evaluation/test_python_evaluation.py
@@ -90,7 +90,7 @@ def test_column_dtypes_match(rating_true, rating_pred):
col_rating=DEFAULT_RATING_COL,
col_prediction=DEFAULT_PREDICTION_COL,
)

# Drop a column; a column mismatch error should be produced
rating_true.drop(DEFAULT_USER_COL, axis="columns", inplace=True)
with pytest.raises(ColumnMismatchError):
@@ -375,10 +375,16 @@ def test_python_r_precision(rating_true, rating_pred, rating_nohit):
k=10,
) == pytest.approx(1, TOL)
assert r_precision_at_k(rating_true, rating_nohit, k=5) == 0.0
assert r_precision_at_k(rating_true, rating_pred, k=3) == pytest.approx(0.21111, TOL)
assert r_precision_at_k(rating_true, rating_pred, k=5) == pytest.approx(0.24444, TOL)
assert r_precision_at_k(rating_true, rating_pred, k=3) == pytest.approx(
0.21111, TOL
)
assert r_precision_at_k(rating_true, rating_pred, k=5) == pytest.approx(
0.24444, TOL
)
# Equivalent to precision
assert r_precision_at_k(rating_true, rating_pred, k=10) == pytest.approx(0.37777, TOL)
assert r_precision_at_k(rating_true, rating_pred, k=10) == pytest.approx(
0.37777, TOL
)


def test_python_auc(rating_true_binary, rating_pred_binary):
@@ -522,9 +528,7 @@ def test_user_diversity(diversity_data):
col_relevance=None,
)
assert_frame_equal(
pd.DataFrame(
dict(UserId=[1, 2, 3], user_diversity=[0.29289, 1.0, 0.0])
),
pd.DataFrame(dict(UserId=[1, 2, 3], user_diversity=[0.29289, 1.0, 0.0])),
actual,
check_exact=False,
atol=TOL,
@@ -625,9 +629,7 @@ def test_user_diversity_item_feature_vector(diversity_data):
col_relevance=None,
)
assert_frame_equal(
pd.DataFrame(
dict(UserId=[1, 2, 3], user_diversity=[0.5000, 0.5000, 0.5000])
),
pd.DataFrame(dict(UserId=[1, 2, 3], user_diversity=[0.5000, 0.5000, 0.5000])),
actual,
check_exact=False,
)
@@ -695,9 +697,7 @@ def test_user_serendipity_item_feature_vector(diversity_data):
col_relevance="Relevance",
)
assert_frame_equal(
pd.DataFrame(
dict(UserId=[1, 2, 3], user_serendipity=[0.2500, 0.625, 0.3333])
),
pd.DataFrame(dict(UserId=[1, 2, 3], user_serendipity=[0.2500, 0.625, 0.3333])),
actual,
check_exact=False,
atol=TOL,
5 changes: 3 additions & 2 deletions tests/unit/recommenders/models/test_deeprec_utils.py
@@ -29,7 +29,8 @@ def test_prepare_hparams(deeprec_resource_path, must_exist_attributes):
"xdeepfmresources.zip",
)
hparams = prepare_hparams(yaml_file)
assert hasattr(hparams, must_exist_attributes)

assert hasattr(hparams, must_exist_attributes) is True


@pytest.mark.gpu
@@ -43,6 +44,6 @@ def test_load_yaml_file(deeprec_resource_path):
data_path,
"xdeepfmresources.zip",
)

config = load_yaml(yaml_file)

assert config is not None
6 changes: 3 additions & 3 deletions tests/unit/recommenders/models/test_sar_singlenode.py
@@ -23,7 +23,7 @@ def test_init(header):
assert model.col_prediction == "prediction"
assert model.similarity_type == "jaccard"
assert model.time_decay_half_life == 2592000
assert not model.time_decay_flag
assert model.time_decay_flag is False
assert model.time_now is None
assert model.threshold == 1

@@ -53,7 +53,7 @@ def test_predict(
preds = model.predict(testset)

assert len(preds) == 2
assert isinstance(preds, pd.DataFrame)
assert isinstance(preds, pd.DataFrame) is True
assert preds[header["col_user"]].dtype == trainset[header["col_user"]].dtype
assert preds[header["col_item"]].dtype == trainset[header["col_item"]].dtype
assert preds[DEFAULT_PREDICTION_COL].dtype == trainset[header["col_rating"]].dtype
@@ -375,7 +375,7 @@ def test_get_normalized_scores(header):
)

assert actual.shape == (2, 7)
assert isinstance(actual, np.ndarray)
assert isinstance(actual, np.ndarray) is True
assert np.isclose(expected, np.asarray(actual)).all()


2 changes: 1 addition & 1 deletion tests/unit/recommenders/utils/test_gpu_utils.py
@@ -47,7 +47,7 @@ def test_get_cudnn_version():

@pytest.mark.gpu
def test_cudnn_enabled():
assert torch.backends.cudnn.enabled == True
assert torch.backends.cudnn.enabled is True


@pytest.mark.gpu
