Skip to content

Commit

Permalink
Add preprocessing argument to Evaluation methods
Browse files Browse the repository at this point in the history
The ``Evaluation`` class didn't include the "prep" argument to
disable/enable the default preprocessing method. As a consequence, users
were not able to perform any hyperparameter optimization or model
evaluation using only their custom preprocessing method. Therefore, the
"prep" argument was added to the ``grid_search()``, ``test()``, and
``kfold_cross_validation()`` functions of the ``Evaluation`` class.

Plus, the tutorial notebook for custom preprocessing was updated to
include an example of how to use the ``Evaluation.grid_search()`` to
find the best hyperparameter values with the new "prep" argument.
  • Loading branch information
sergioburdisso committed May 8, 2020
1 parent a847b28 commit 7c6b0c6
Show file tree
Hide file tree
Showing 2 changed files with 130 additions and 16 deletions.
105 changes: 105 additions & 0 deletions examples/using_custom_preprocessing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@
"metadata": {},
"outputs": [],
"source": [
"# In the \"Hyperparameter Optimization\" section at the bottom,\n",
"# it is shown how we obtained these hyperparameter values: s=.44, l=.48, p=0.5\n",
"clf = SS3(s=.44, l=.48, p=0.5)\n",
"\n",
"# Let the training begin!\n",
Expand Down Expand Up @@ -198,6 +200,109 @@
"source": [
"... and... that's it for now, well done! :D"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Hyperparameter Optimization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"clf = SS3(name=\"movie-reviews\")\n",
"\n",
"# to speed up the process, we won't use 3-gram but single words\n",
"# (i.e. we won't use the n_grams=3 argument)\n",
"clf.train(x_train_prep, y_train, prep=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_s, best_l, best_p, best_a = Evaluation.grid_search(\n",
" clf, x_test_prep, y_test,\n",
" s=span(0.2, 0.8, 6),\n",
" l=span(0.1, 2, 6),\n",
" p=span(0.5, 2, 6),\n",
" a=[0, .1, .2],\n",
" prep=False, # <- do not forget to disable default preprocessing\n",
" tag=\"grid search (test)\"\n",
")\n",
"\n",
"print(\"The hyperparameter values that obtained the best Accuracy are:\")\n",
"print(\"Smoothness(s):\", best_s)\n",
"print(\"Significance(l):\", best_l)\n",
"print(\"Sanction(p):\", best_p)\n",
"print(\"Alpha(a):\", best_a)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Evaluation.plot()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"clf.set_hyperparameters(0.44, 0.48, 0.5, 0.0)\n",
"y_pred = clf.predict(x_test_prep, prep=False)\n",
"\n",
"accuracy = accuracy_score(y_pred, y_test)\n",
"print(\"Accuracy was:\", accuracy)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The best accuracy with the obtained hyperparameters is 0.828. Now let's train a 3-grams version using the same hyperparameters:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"clf = SS3(0.44, 0.48, 0.5, 0.0, name=\"movie-reviews\")\n",
"\n",
"clf.train(x_train_prep, y_train, n_grams=3, prep=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_pred = clf.predict(x_test_prep, prep=False)\n",
"\n",
"accuracy = accuracy_score(y_pred, y_test)\n",
"print(\"Accuracy was:\", accuracy)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The accuracy improved! It went from .828 to .853 :)"
]
}
],
"metadata": {
Expand Down
41 changes: 25 additions & 16 deletions pyss3/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -681,7 +681,7 @@ def __evaluation_result__(
def __grid_search_loop__(
clf, x_test, y_test, ss, ll, pp, aa, k_fold,
i_fold, def_cat, tag, categories, cache=True,
leave_pbar=True, extended_pbar=False, desc_pbar=None
leave_pbar=True, extended_pbar=False, desc_pbar=None, prep=True
):
"""Grid search main loop."""
method = Evaluation.__kfold2method__(k_fold)
Expand Down Expand Up @@ -731,7 +731,8 @@ def __grid_search_loop__(
)

y_pred = clf.predict(
x_test, def_cat, labels=False, leave_pbar=False
x_test, def_cat,
labels=False, leave_pbar=False, prep=prep
)

Evaluation.__evaluation_result__(
Expand Down Expand Up @@ -1121,7 +1122,7 @@ def remove(s=None, l=None, p=None, a=None, method=None, def_cat=None, tag=None,

@staticmethod
def test(
clf, x_test, y_test, def_cat=STR_MOST_PROBABLE,
clf, x_test, y_test, def_cat=STR_MOST_PROBABLE, prep=True,
tag=None, plot=True, metric='accuracy', metric_target='macro avg', cache=True
):
"""
Expand Down Expand Up @@ -1150,6 +1151,9 @@ def test(
'most-probable', 'unknown' or a given category name.
(default: 'most-probable')
:type def_cat: str
:param prep: enables the default input preprocessing when classifying
(default: True)
:type prep: bool
:param tag: the cache tag to be used, i.e. a string to identify this evaluation
inside the cache storage (optional)
:type tag: str
Expand Down Expand Up @@ -1188,7 +1192,7 @@ def test(
# if not cached
if not y_pred:
clf.set_hyperparameters(s, l, p, a)
y_pred = clf.predict(x_test, def_cat, labels=False)
y_pred = clf.predict(x_test, def_cat, prep=prep, labels=False)
categories = clf.get_categories()
y_test = [clf.get_category_index(y) for y in y_test]
else:
Expand All @@ -1203,9 +1207,8 @@ def test(

@staticmethod
def kfold_cross_validation(
clf, x_train, y_train, k=4, n_grams=None,
def_cat=STR_MOST_PROBABLE, tag=None, plot=True,
metric='accuracy', metric_target='macro avg', cache=True
clf, x_train, y_train, k=4, n_grams=None, def_cat=STR_MOST_PROBABLE, prep=True,
tag=None, plot=True, metric='accuracy', metric_target='macro avg', cache=True
):
"""
Perform a Stratified k-fold cross validation on the given training set.
Expand Down Expand Up @@ -1243,6 +1246,9 @@ def kfold_cross_validation(
'most-probable', 'unknown' or a given category name.
(default: 'most-probable')
:type def_cat: str
:param prep: enables the default input preprocessing when classifying
(default: True)
:type prep: bool
:param tag: the cache tag to be used, i.e. a string to identify this evaluation
inside the cache storage (optional)
:type tag: str
Expand Down Expand Up @@ -1304,12 +1310,12 @@ def kfold_cross_validation(
clf_fold.set_hyperparameters(s, l, p, a)
Print.verbosity_region_begin(VERBOSITY.QUIET)
progress_bar.set_description_str(pbar_desc + " [training...]")
clf_fold.fit(x_train_fold, y_train_fold, n_grams, leave_pbar=False)
clf_fold.fit(x_train_fold, y_train_fold, n_grams,
prep=prep, leave_pbar=False)

progress_bar.set_description_str(pbar_desc + " [classifying...]")
y_pred = clf_fold.predict(
x_test_fold, def_cat, labels=False, leave_pbar=False
)
y_pred = clf_fold.predict(x_test_fold, def_cat,
prep=prep, labels=False, leave_pbar=False)
Print.verbosity_region_end()

Evaluation.__evaluation_result__(
Expand All @@ -1334,8 +1340,8 @@ def kfold_cross_validation(
@staticmethod
def grid_search(
clf, x_data, y_data, s=None, l=None, p=None, a=None,
k_fold=None, n_grams=None, def_cat=STR_MOST_PROBABLE, tag=None,
metric='accuracy', metric_target='macro avg', cache=True, extended_pbar=False
k_fold=None, n_grams=None, def_cat=STR_MOST_PROBABLE, prep=True,
tag=None, metric='accuracy', metric_target='macro avg', cache=True, extended_pbar=False
):
"""
Perform a grid search using the provided hyperparameter values.
Expand Down Expand Up @@ -1418,6 +1424,9 @@ def grid_search(
'most-probable', 'unknown' or a given category name.
(default: 'most-probable')
:type def_cat: str
:param prep: enables the default input preprocessing when classifying
(default: True)
:type prep: bool
:param tag: the cache tag to be used, i.e. a string to identify this evaluation
inside the cache storage (optional)
:type tag: str
Expand Down Expand Up @@ -1464,7 +1473,7 @@ def grid_search(
Evaluation.__grid_search_loop__(
clf, x_test, y_test, s, l, p, a, 1, 0,
def_cat, tag, clf.get_categories(), cache,
extended_pbar=extended_pbar
extended_pbar=extended_pbar, prep=prep
)
else: # if k-fold
Print.verbosity_region_begin(VERBOSITY.NORMAL)
Expand All @@ -1481,13 +1490,13 @@ def grid_search(
categories = clf.get_categories()

clf_fold = SS3()
clf_fold.fit(x_train, y_train, n_grams, leave_pbar=False)
clf_fold.fit(x_train, y_train, n_grams, prep=prep, leave_pbar=False)

Evaluation.__grid_search_loop__(
clf_fold, x_test, y_test, s, l, p, a, k_fold, i_fold,
def_cat, tag, categories, cache,
leave_pbar=False, extended_pbar=extended_pbar,
desc_pbar="[fold %d/%d] Grid search" % (i_fold + 1, k_fold)
desc_pbar="[fold %d/%d] Grid search" % (i_fold + 1, k_fold), prep=prep
)
Evaluation.__cache_update__()

Expand Down

0 comments on commit 7c6b0c6

Please sign in to comment.