Skip to content

Commit

Permalink
Add preprocessing argument to Evaluation methods
Browse files Browse the repository at this point in the history
The ``Evaluation`` class didn't include the "prep" argument to
disable/enable the default preprocessing method. As a consequence, users
were not able to perform any hyperparameter optimization or model
evaluation using only their custom preprocessing method. Therefore, the
"prep" argument was added to the ``grid_search()``, ``test()``, and
``kfold_cross_validation()`` functions of the ``Evaluation`` class.

Plus, the tutorial notebook for custom preprocessing was updated to
include an example of how to use the ``Evaluation.grid_search()`` to
find the best hyperparameter values with the new "prep" argument.
  • Loading branch information
sergioburdisso committed May 8, 2020
1 parent a847b28 commit 7c6b0c6
Show file tree
Hide file tree
Showing 2 changed files with 130 additions and 16 deletions.
105 changes: 105 additions & 0 deletions examples/using_custom_preprocessing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@
"metadata": {},
"outputs": [],
"source": [
"# In the \"Hyperparameter Optimization\" section at the bottom,\n",
"# it is shown how we obtained these hyperparameter values: s=.44, l=.48, p=0.5\n",
"clf = SS3(s=.44, l=.48, p=0.5)\n",
"\n",
"# Let the training begin!\n",
Expand Down Expand Up @@ -198,6 +200,109 @@
"source": [
"... and... that's it for now, well done! :D"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Hyperparameter Optimization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"clf = SS3(name=\"movie-reviews\")\n",
"\n",
"# to speed up the process, we won't use 3-gram but single words\n",
"# (i.e. we won't use the n_grams=3 argument)\n",
"clf.train(x_train_prep, y_train, prep=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_s, best_l, best_p, best_a = Evaluation.grid_search(\n",
" clf, x_test_prep, y_test,\n",
" s=span(0.2, 0.8, 6),\n",
" l=span(0.1, 2, 6),\n",
" p=span(0.5, 2, 6),\n",
" a=[0, .1, .2],\n",
" prep=False, # <- do not forget to disable default preprocessing\n",
" tag=\"grid search (test)\"\n",
")\n",
"\n",
"print(\"The hyperparameter values that obtained the best Accuracy are:\")\n",
"print(\"Smoothness(s):\", best_s)\n",
"print(\"Significance(l):\", best_l)\n",
"print(\"Sanction(p):\", best_p)\n",
"print(\"Alpha(a):\", best_a)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Evaluation.plot()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"clf.set_hyperparameters(0.44, 0.48, 0.5, 0.0)\n",
"y_pred = clf.predict(x_test_prep, prep=False)\n",
"\n",
"accuracy = accuracy_score(y_pred, y_test)\n",
"print(\"Accuracy was:\", accuracy)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The best accuracy with the obtained hyperparameters is 0.828. Now let's train a 3-grams version using the same hyperparameters:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"clf = SS3(0.44, 0.48, 0.5, 0.0, name=\"movie-reviews\")\n",
"\n",
"clf.train(x_train_prep, y_train, n_grams=3, prep=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_pred = clf.predict(x_test_prep, prep=False)\n",
"\n",
"accuracy = accuracy_score(y_pred, y_test)\n",
"print(\"Accuracy was:\", accuracy)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The accuracy improved! It went from .828 to .853 :)"
]
}
],
"metadata": {
Expand Down
41 changes: 25 additions & 16 deletions pyss3/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -681,7 +681,7 @@ def __evaluation_result__(
def __grid_search_loop__(
clf, x_test, y_test, ss, ll, pp, aa, k_fold,
i_fold, def_cat, tag, categories, cache=True,
leave_pbar=True, extended_pbar=False, desc_pbar=None
leave_pbar=True, extended_pbar=False, desc_pbar=None, prep=True
):
"""Grid search main loop."""
method = Evaluation.__kfold2method__(k_fold)
Expand Down Expand Up @@ -731,7 +731,8 @@ def __grid_search_loop__(
)

y_pred = clf.predict(
x_test, def_cat, labels=False, leave_pbar=False
x_test, def_cat,
labels=False, leave_pbar=False, prep=prep
)

Evaluation.__evaluation_result__(
Expand Down Expand Up @@ -1121,7 +1122,7 @@ def remove(s=None, l=None, p=None, a=None, method=None, def_cat=None, tag=None,

@staticmethod
def test(
clf, x_test, y_test, def_cat=STR_MOST_PROBABLE,
clf, x_test, y_test, def_cat=STR_MOST_PROBABLE, prep=True,
tag=None, plot=True, metric='accuracy', metric_target='macro avg', cache=True
):
"""
Expand Down Expand Up @@ -1150,6 +1151,9 @@ def test(
'most-probable', 'unknown' or a given category name.
(default: 'most-probable')
:type def_cat: str
:param prep: enables the default input preprocessing when classifying
(default: True)
:type prep: bool
:param tag: the cache tag to be used, i.e. a string to identify this evaluation
inside the cache storage (optional)
:type tag: str
Expand Down Expand Up @@ -1188,7 +1192,7 @@ def test(
# if not cached
if not y_pred:
clf.set_hyperparameters(s, l, p, a)
y_pred = clf.predict(x_test, def_cat, labels=False)
y_pred = clf.predict(x_test, def_cat, prep=prep, labels=False)
categories = clf.get_categories()
y_test = [clf.get_category_index(y) for y in y_test]
else:
Expand All @@ -1203,9 +1207,8 @@ def test(

@staticmethod
def kfold_cross_validation(
clf, x_train, y_train, k=4, n_grams=None,
def_cat=STR_MOST_PROBABLE, tag=None, plot=True,
metric='accuracy', metric_target='macro avg', cache=True
clf, x_train, y_train, k=4, n_grams=None, def_cat=STR_MOST_PROBABLE, prep=True,
tag=None, plot=True, metric='accuracy', metric_target='macro avg', cache=True
):
"""
Perform a Stratified k-fold cross validation on the given training set.
Expand Down Expand Up @@ -1243,6 +1246,9 @@ def kfold_cross_validation(
'most-probable', 'unknown' or a given category name.
(default: 'most-probable')
:type def_cat: str
:param prep: enables the default input preprocessing when classifying
(default: True)
:type prep: bool
:param tag: the cache tag to be used, i.e. a string to identify this evaluation
inside the cache storage (optional)
:type tag: str
Expand Down Expand Up @@ -1304,12 +1310,12 @@ def kfold_cross_validation(
clf_fold.set_hyperparameters(s, l, p, a)
Print.verbosity_region_begin(VERBOSITY.QUIET)
progress_bar.set_description_str(pbar_desc + " [training...]")
clf_fold.fit(x_train_fold, y_train_fold, n_grams, leave_pbar=False)
clf_fold.fit(x_train_fold, y_train_fold, n_grams,
prep=prep, leave_pbar=False)

progress_bar.set_description_str(pbar_desc + " [classifying...]")
y_pred = clf_fold.predict(
x_test_fold, def_cat, labels=False, leave_pbar=False
)
y_pred = clf_fold.predict(x_test_fold, def_cat,
prep=prep, labels=False, leave_pbar=False)
Print.verbosity_region_end()

Evaluation.__evaluation_result__(
Expand All @@ -1334,8 +1340,8 @@ def kfold_cross_validation(
@staticmethod
def grid_search(
clf, x_data, y_data, s=None, l=None, p=None, a=None,
k_fold=None, n_grams=None, def_cat=STR_MOST_PROBABLE, tag=None,
metric='accuracy', metric_target='macro avg', cache=True, extended_pbar=False
k_fold=None, n_grams=None, def_cat=STR_MOST_PROBABLE, prep=True,
tag=None, metric='accuracy', metric_target='macro avg', cache=True, extended_pbar=False
):
"""
Perform a grid search using the provided hyperparameter values.
Expand Down Expand Up @@ -1418,6 +1424,9 @@ def grid_search(
'most-probable', 'unknown' or a given category name.
(default: 'most-probable')
:type def_cat: str
:param prep: enables the default input preprocessing when classifying
(default: True)
:type prep: bool
:param tag: the cache tag to be used, i.e. a string to identify this evaluation
inside the cache storage (optional)
:type tag: str
Expand Down Expand Up @@ -1464,7 +1473,7 @@ def grid_search(
Evaluation.__grid_search_loop__(
clf, x_test, y_test, s, l, p, a, 1, 0,
def_cat, tag, clf.get_categories(), cache,
extended_pbar=extended_pbar
extended_pbar=extended_pbar, prep=prep
)
else: # if k-fold
Print.verbosity_region_begin(VERBOSITY.NORMAL)
Expand All @@ -1481,13 +1490,13 @@ def grid_search(
categories = clf.get_categories()

clf_fold = SS3()
clf_fold.fit(x_train, y_train, n_grams, leave_pbar=False)
clf_fold.fit(x_train, y_train, n_grams, prep=prep, leave_pbar=False)

Evaluation.__grid_search_loop__(
clf_fold, x_test, y_test, s, l, p, a, k_fold, i_fold,
def_cat, tag, categories, cache,
leave_pbar=False, extended_pbar=extended_pbar,
desc_pbar="[fold %d/%d] Grid search" % (i_fold + 1, k_fold)
desc_pbar="[fold %d/%d] Grid search" % (i_fold + 1, k_fold), prep=prep
)
Evaluation.__cache_update__()

Expand Down

0 comments on commit 7c6b0c6

Please sign in to comment.