CLARIN-PL · ktagowski · Feb 15, 2023 · Feb 15, 2023 · Apr 17, 2023 · laugustyniak
diff --git a/.github/workflows/ghp_deploy.yml b/.github/workflows/ghp_deploy.yml
@@ -2,7 +2,7 @@ name: CD
 on:
   push:
     branches:
-      ["main", "master", "271-create-documentation-and-library-presentation"]
+      ["main", "master", "271-create-documentation-and-library-presentation", "fix/fix-inference-tutorial"]
   workflow_dispatch:
 jobs:
   deploy_ghp:

diff --git a/nbs/01_Tutorials/validate_lightning_models_inference.ipynb b/nbs/01_Tutorials/validate_lightning_models_inference.ipynb
@@ -30,30 +30,24 @@
    "outputs": [],
    "source": [
     "#| hide\n",
-    "%load_ext autoreload\n",
-    "%autoreload 2"
+    "import os\n",
+    "import warnings\n",
+    "\n",
+    "# disable warnings\n",
+    "warnings.simplefilter(\"ignore\")\n",
+    "# set os environ variable for multiprocesses\n",
+    "os.environ[\"PYTHONWARNINGS\"] = \"ignore\"\n",
+    "\n",
+    "os.chdir(\"..\")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "1019b750-cebe-438b-b1ab-434d6f756864",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/opt/conda/envs/embeddings/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "#| eval: false\n",
-    "import os\n",
-    "\n",
-    "os.chdir(\"..\")\n",
     "from typing import Any, Dict\n",
     "\n",
     "import pytorch_lightning as pl\n",
@@ -75,25 +69,15 @@
    "execution_count": null,
    "id": "2d0e06e2-3c5a-420b-b065-31d5ccd6b255",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2023-02-13 23:05:26,246 - embeddings.utils.utils - WARNING - String 'hf-internal-testing/tiny-albert' contains '/'. Replacing it with '__'. Cleaned_text: hf-internal-testing__tiny-albert.\n",
-      "2023-02-13 23:05:26,247 - embeddings.utils.utils - WARNING - String 'clarin-pl/polemo2-official' contains '/'. Replacing it with '__'. Cleaned_text: clarin-pl__polemo2-official.\n",
-      "2023-02-13 23:05:26,254 - embeddings.utils.utils - WARNING - String 'hf-internal-testing/tiny-albert' contains '/'. Replacing it with '__'. Cleaned_text: hf-internal-testing__tiny-albert.\n",
-      "2023-02-13 23:05:26,256 - embeddings.utils.utils - WARNING - String 'clarin-pl/polemo2-official' contains '/'. Replacing it with '__'. Cleaned_text: clarin-pl__polemo2-official.\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "#| eval: false\n",
+    "#|exec_doc\n",
+    "\n",
     "embedding_name_or_path = \"hf-internal-testing/tiny-albert\"\n",
     "dataset_name = \"clarin-pl/polemo2-official\"\n",
     "\n",
     "dataset_path = build_output_path(DATASET_PATH, embedding_name_or_path, dataset_name)\n",
-    "output_path = build_output_path(RESULTS_PATH, embedding_name_or_path, dataset_name)"
+    "output_path = build_output_path(\".\", embedding_name_or_path, dataset_name, mkdirs=True)"
    ]
   },
   {
@@ -109,26 +93,10 @@
    "execution_count": null,
    "id": "095d1c88-900f-4275-a879-f9efdb73265a",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Using custom data configuration default-e0c1ce6ddfd81769\n",
-      "Found cached dataset polemo2-official (/root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70)\n",
-      "100%|██████████| 3/3 [00:00<00:00, 817.23it/s]\n",
-      "Loading cached split indices for dataset at /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-a54edce9681df8b7.arrow and /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-09cf731207f31628.arrow\n",
-      "Loading cached split indices for dataset at /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-c48721732fabb729.arrow and /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-f9d782422a65c7e6.arrow\n",
-      "Loading cached split indices for dataset at /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-0db6321193feb3ec.arrow and /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-4e6c26839c3e4adf.arrow\n",
-      "Loading cached processed dataset at /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-e32b75da1d28bfd0.arrow\n",
-      "Loading cached processed dataset at /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-98cbedcc70a23855.arrow\n",
-      "Loading cached processed dataset at /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-b2cbb8ab856bac0f.arrow\n",
-      "                                                                                         \r"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "#| eval: false\n",
+    "#|exec_doc\n",
+    "\n",
     "def preprocess_data(path: str) -> Dict[str, Any]:\n",
     "    pipeline = HuggingFacePreprocessingPipeline(\n",
     "        dataset_name=dataset_name,\n",
@@ -141,7 +109,7 @@
     "        persist_path=path,\n",
     "        sample_missing_splits=None,\n",
     "        ignore_test_subset=False,\n",
-    "        downsample_splits=(0.01, 0.01, 0.05),\n",
+    "        downsample_splits=(0.01, 0.01, 0.01),\n",
     "        seed=441,\n",
     "    )\n",
     "    pipeline.run()\n",
@@ -171,7 +139,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "#| eval: false\n",
+    "#|exec_doc\n",
+    "\n",
     "config = LightningAdvancedConfig(\n",
     "    finetune_last_n_layers=0,\n",
     "    task_train_kwargs={\"max_epochs\": 1, \"deterministic\": True,},\n",
@@ -199,140 +168,10 @@
    "execution_count": null,
    "id": "148a0089-f461-4948-93fa-04f2e34ac9e0",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 1/1 [00:00<00:00,  6.73ba/s]\n",
-      "100%|██████████| 1/1 [00:00<00:00, 25.69ba/s]\n",
-      "100%|██████████| 1/1 [00:00<00:00, 26.51ba/s]\n",
-      "Casting the dataset: 100%|██████████| 1/1 [00:00<00:00, 113.84ba/s]\n",
-      "Casting the dataset: 100%|██████████| 1/1 [00:00<00:00, 68.70ba/s]\n",
-      "Casting the dataset: 100%|██████████| 1/1 [00:00<00:00, 103.15ba/s]\n",
-      "Some weights of the model checkpoint at hf-internal-testing/tiny-albert were not used when initializing AlbertForSequenceClassification: ['predictions.LayerNorm.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.decoder.weight', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.dense.weight']\n",
-      "- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
-      "- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
-      "Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at hf-internal-testing/tiny-albert and are newly initialized: ['classifier.bias', 'albert.pooler.weight', 'albert.pooler.bias', 'classifier.weight']\n",
-      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
-      "GPU available: True, used: False\n",
-      "TPU available: False, using: 0 TPU cores\n",
-      "IPU available: False, using: 0 IPUs\n",
-      "/opt/conda/envs/embeddings/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1579: UserWarning: GPU available but not used. Set the gpus flag in your trainer `Trainer(gpus=1)` or script `--gpus=1`.\n",
-      "  rank_zero_warn(\n",
-      "\n",
-      "  | Name          | Type                            | Params\n",
-      "------------------------------------------------------------------\n",
-      "0 | model         | AlbertForSequenceClassification | 352 K \n",
-      "1 | metrics       | MetricCollection                | 0     \n",
-      "2 | train_metrics | MetricCollection                | 0     \n",
-      "3 | val_metrics   | MetricCollection                | 0     \n",
-      "4 | test_metrics  | MetricCollection                | 0     \n",
-      "------------------------------------------------------------------\n",
-      "132       Trainable params\n",
-      "352 K     Non-trainable params\n",
-      "352 K     Total params\n",
-      "1.410     Total estimated model params size (MB)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Validation sanity check:   0%|          | 0/1 [00:00<?, ?it/s]"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/opt/conda/envs/embeddings/lib/python3.9/site-packages/pytorch_lightning/trainer/data_loading.py:111: UserWarning: The dataloader, val_dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 48 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n",
-      "  rank_zero_warn(\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "                                                                      "
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/opt/conda/envs/embeddings/lib/python3.9/site-packages/pytorch_lightning/trainer/data_loading.py:111: UserWarning: The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 48 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n",
-      "  rank_zero_warn(\n",
-      "/opt/conda/envs/embeddings/lib/python3.9/site-packages/pytorch_lightning/trainer/data_loading.py:407: UserWarning: The number of training samples (2) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.\n",
-      "  rank_zero_warn(\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Epoch 0: 100%|██████████| 3/3 [00:01<00:00,  2.07it/s, loss=1.39, v_num=, train/BaseLR=5e-6, train/LambdaLR=5e-6, val/MulticlassAccuracy=0.375, val/MulticlassPrecision=0.0938, val/MulticlassRecall=0.250, val/MulticlassF1Score=0.136]"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/opt/conda/envs/embeddings/lib/python3.9/site-packages/pytorch_lightning/trainer/data_loading.py:111: UserWarning: The dataloader, test_dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 48 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n",
-      "  rank_zero_warn(\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "Testing: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]--------------------------------------------------------------------------------\n",
-      "DATALOADER:0 TEST RESULTS\n",
-      "{'test/Loss': 1.3823001384735107,\n",
-      " 'test/MulticlassAccuracy': 0.4054054021835327,\n",
-      " 'test/MulticlassF1Score': 0.14423076808452606,\n",
-      " 'test/MulticlassPrecision': 0.10135135054588318,\n",
-      " 'test/MulticlassRecall': 0.25}\n",
-      "--------------------------------------------------------------------------------\n",
-      "Testing: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Restoring states from the checkpoint path at /app/resources/results/hf-internal-testing__tiny-albert/clarin-pl__polemo2-official/20230213_230526/checkpoints/epoch=0-step=1.ckpt\n",
-      "Loaded model weights from checkpoint at /app/resources/results/hf-internal-testing__tiny-albert/clarin-pl__polemo2-official/20230213_230526/checkpoints/epoch=0-step=1.ckpt\n",
-      "/opt/conda/envs/embeddings/lib/python3.9/site-packages/pytorch_lightning/trainer/data_loading.py:111: UserWarning: The dataloader, predict_dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 48 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n",
-      "  rank_zero_warn(\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "Predicting: 100%|██████████| 2/2 [00:01<?, ?it/s]\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/app/embeddings/metric/hugging_face_metric.py:27: FutureWarning: load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate\n",
-      "  datasets.load_metric(metric, **init_kwargs) if isinstance(metric, str) else metric\n",
-      "/opt/conda/envs/embeddings/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
-      "  _warn_prf(average, modifier, msg_start, len(result))\n",
-      "/opt/conda/envs/embeddings/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
-      "  _warn_prf(average, modifier, msg_start, len(result))\n",
-      "/opt/conda/envs/embeddings/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
-      "  _warn_prf(average, modifier, msg_start, len(result))\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "#| eval: false\n",
+    "#|exec_doc\n",
+    "\n",
     "pipeline = LightningClassificationPipeline(\n",
     "    embedding_name_or_path=embedding_name_or_path,\n",
     "    output_path=output_path,\n",
@@ -352,26 +191,17 @@
     "### Load model from chechpoint automatically generated with Trainer"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ee9e824c-00f1-45b0-9e32-1bd33f364f3a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#| eval: false\n",
-    "ckpt_path = output_path / \"checkpoints\" / \"last.ckpt\"\n",
-    "ckpt_path"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "2785fcbc-1c95-4d23-807f-a14569992354",
    "metadata": {},
    "outputs": [],
    "source": [
-    "#| eval: false\n",
+    "#|exec_doc\n",
+    "\n",
+    "ckpt_path = output_path / \"checkpoints\" / \"epoch=0-step=1.ckpt\"\n",
+    "\n",
     "task_from_ckpt = TextClassificationTask.from_checkpoint(\n",
     "    checkpoint_path=ckpt_path, output_path=output_path,\n",
     ")"
@@ -392,7 +222,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "#| eval: false\n",
+    "#|exec_doc\n",
+    "\n",
     "model_from_ckpt = TextClassificationModule.load_from_checkpoint(str(ckpt_path))"
    ]
   },
@@ -429,7 +260,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "#| eval: false\n",
+    "#|exec_doc\n",
+    "\n",
     "test_dataloader = pipeline.datamodule.test_dataloader()\n",
     "preds = task_from_ckpt.predict(test_dataloader)\n",
     "preds"
@@ -450,7 +282,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "#| eval: false\n",
+    "#|exec_doc\n",
+    "\n",
     "task_from_ckpt.trainer.datamodule = pipeline.datamodule\n",
     "preds_with_names = task_from_ckpt.predict(test_dataloader, return_names=True)\n",
     "preds_with_names"
@@ -471,7 +304,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "#| eval: false\n",
+    "#|exec_doc\n",
+    "\n",
     "trainer = pl.Trainer(default_root_dir=str(output_path))\n",
     "preds_from_model = trainer.predict(model_from_ckpt, dataloaders=test_dataloader)\n",
     "preds_from_model"
@@ -480,18 +314,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "embeddings",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
-  },
-  "language_info": {
-   "name": "python",
-   "version": "3.9.16"
-  },
-  "vscode": {
-   "interpreter": {
-    "hash": "86b40992624b6ecf125385760a49d2b554d653d5c84d942a6f4a5512888cc722"
-   }
   }
  },
  "nbformat": 4,