Added time performance tests #1765

Merged · 12 commits · Jul 11, 2022
@@ -0,0 +1,235 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import numpy as np
import pandas as pd
import pytest
from sklearn.preprocessing import minmax_scale

from recommenders.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,
DEFAULT_PREDICTION_COL,
SEED,
)
from recommenders.evaluation.python_evaluation import (
merge_rating_true_pred,
merge_ranking_true_pred,
rmse,
mae,
rsquared,
exp_var,
get_top_k_items,
precision_at_k,
recall_at_k,
ndcg_at_k,
map_at_k,
auc,
logloss,
)
from recommenders.utils.timer import Timer

# Seed NumPy's global RNG so the synthetic data below is reproducible;
# the stdlib random.seed does not affect np.random.choice.
np.random.seed(SEED)
# Dimensions of the synthetic dataset: 10,000 users, 20,000 items, and
# 10,000,000 sampled interactions with integer ratings in [1, DATA_RATING_MAX].
DATA_USER_NUM = 10000
DATA_ITEM_NUM = DATA_USER_NUM * 2
DATA_SAMPLE_NUM = DATA_USER_NUM * 1000
DATA_RATING_MAX = 5

# fmt: off
# Synthetic ground-truth ratings: DATA_SAMPLE_NUM random (user, item, rating) rows
@pytest.fixture
def rating_true():
return pd.DataFrame(
{
DEFAULT_USER_COL: np.random.choice(range(0, DATA_USER_NUM), DATA_SAMPLE_NUM),
DEFAULT_ITEM_COL: np.random.choice(range(0, DATA_ITEM_NUM), DATA_SAMPLE_NUM),
DEFAULT_RATING_COL: np.random.choice(range(1, DATA_RATING_MAX+1), DATA_SAMPLE_NUM)
}
)


# Synthetic predictions: same shape as rating_true, with a prediction column
# in place of the rating column
@pytest.fixture
def rating_pred():
return pd.DataFrame(
{
DEFAULT_USER_COL: np.random.choice(range(0, DATA_USER_NUM), DATA_SAMPLE_NUM),
DEFAULT_ITEM_COL: np.random.choice(range(0, DATA_ITEM_NUM), DATA_SAMPLE_NUM),
DEFAULT_PREDICTION_COL: np.random.choice(range(1, DATA_RATING_MAX+1), DATA_SAMPLE_NUM)
}
)
# fmt: on


@pytest.fixture
def rating_true_binary(rating_true):
# Convert true ratings to binary
rating_true[DEFAULT_RATING_COL] = rating_true[DEFAULT_RATING_COL].apply(
lambda x: 1.0 if x >= 3 else 0.0
)
return rating_true


@pytest.fixture
def rating_pred_binary(rating_pred):
    # Normalize the predictions to the [0, 1] range
rating_pred[DEFAULT_PREDICTION_COL] = minmax_scale(
rating_pred[DEFAULT_PREDICTION_COL].astype(float)
)
return rating_pred


# The following time thresholds were benchmarked on an Azure
# Standard_D14_v2 VM with an Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz,
# 16 cores, 112 GB RAM, and an 800 GB disk.
# Each threshold is calculated as MEAN + 5 * STANDARD DEVIATION of the
# measured runtimes.
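

# As an illustration of the methodology above, a per-test threshold could be
# derived along these lines. This is a minimal sketch, not the benchmark that
# produced the numbers below; the helper name and the n_runs/n_sigmas
# parameters are hypothetical.
def _estimate_threshold(func, n_runs=10, n_sigmas=5):
    # Time the target callable several times, collecting wall-clock intervals.
    intervals = []
    for _ in range(n_runs):
        with Timer() as t:
            func()
        intervals.append(t.interval)
    # Threshold = MEAN + n_sigmas * STANDARD DEVIATION of the observed runtimes.
    return float(np.mean(intervals) + n_sigmas * np.std(intervals))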


def test_merge_rating(rating_true, rating_pred):
with Timer() as t:
merge_rating_true_pred(
rating_true,
rating_pred,
col_user=DEFAULT_USER_COL,
col_item=DEFAULT_ITEM_COL,
col_rating=DEFAULT_RATING_COL,
col_prediction=DEFAULT_PREDICTION_COL,
)
assert t.interval < 19.58985256


def test_merge_ranking(rating_true, rating_pred):
with Timer() as t:
merge_ranking_true_pred(
rating_true,
rating_pred,
col_user=DEFAULT_USER_COL,
col_item=DEFAULT_ITEM_COL,
col_rating=DEFAULT_RATING_COL,
col_prediction=DEFAULT_PREDICTION_COL,
relevancy_method="top_k",
)
assert t.interval < 21.52722161


def test_python_rmse(rating_true, rating_pred):
with Timer() as t:
rmse(
rating_true=rating_true,
rating_pred=rating_pred,
col_prediction=DEFAULT_PREDICTION_COL,
)
assert t.interval < 29.95031411


def test_python_mae(rating_true, rating_pred):
with Timer() as t:
mae(
rating_true=rating_true,
rating_pred=rating_pred,
col_prediction=DEFAULT_PREDICTION_COL,
)
assert t.interval < 30.45756622


def test_python_rsquared(rating_true, rating_pred):
with Timer() as t:
rsquared(
rating_true=rating_true,
rating_pred=rating_pred,
col_prediction=DEFAULT_PREDICTION_COL,
)
assert t.interval < 30.60572284


def test_python_exp_var(rating_true, rating_pred):
with Timer() as t:
exp_var(
rating_true=rating_true,
rating_pred=rating_pred,
col_prediction=DEFAULT_PREDICTION_COL,
)
assert t.interval < 31.01451915


def test_get_top_k_items(rating_true):
with Timer() as t:
get_top_k_items(
dataframe=rating_true,
col_user=DEFAULT_USER_COL,
col_rating=DEFAULT_RATING_COL,
k=10,
)
assert t.interval < 2.94850964


def test_get_top_k_items_largek(rating_true):
with Timer() as t:
get_top_k_items(
dataframe=rating_true,
col_user=DEFAULT_USER_COL,
col_rating=DEFAULT_RATING_COL,
k=1000,
)
assert t.interval < 4.04160459


def test_python_ndcg_at_k(rating_true, rating_pred):
with Timer() as t:
ndcg_at_k(
rating_true=rating_true,
rating_pred=rating_pred,
col_prediction=DEFAULT_PREDICTION_COL,
k=10,
)
assert t.interval < 21.55627936


def test_python_map_at_k(rating_true, rating_pred):
with Timer() as t:
map_at_k(
rating_true=rating_true,
rating_pred=rating_pred,
col_prediction=DEFAULT_PREDICTION_COL,
k=10,
)
assert t.interval < 29.90376154


def test_python_precision(rating_true, rating_pred):
with Timer() as t:
        precision_at_k(
            rating_true=rating_true,
            rating_pred=rating_pred,
            col_prediction=DEFAULT_PREDICTION_COL,
            k=10,
        )
assert t.interval < 29.95129834


def test_python_recall(rating_true, rating_pred):
with Timer() as t:
recall_at_k(
rating_true=rating_true,
rating_pred=rating_pred,
col_prediction=DEFAULT_PREDICTION_COL,
k=10,
)
assert t.interval < 30.29558967


def test_python_auc(rating_true_binary, rating_pred_binary):
with Timer() as t:
auc(
rating_true=rating_true_binary,
rating_pred=rating_pred_binary,
col_rating=DEFAULT_RATING_COL,
col_prediction=DEFAULT_PREDICTION_COL,
)
assert t.interval < 21.53587225


def test_python_logloss(rating_true_binary, rating_pred_binary):
with Timer() as t:
logloss(
rating_true=rating_true_binary,
rating_pred=rating_pred_binary,
col_rating=DEFAULT_RATING_COL,
col_prediction=DEFAULT_PREDICTION_COL,
)
assert t.interval < 32.91629787