From ab83f49492cad99abc35a425ef9acaf2be13125e Mon Sep 17 00:00:00 2001 From: shaigrt Date: Sun, 26 May 2024 19:16:52 +0300 Subject: [PATCH] Adding go emotions dataset (#865) --- prepare/cards/go_emotions.py | 56 ++++++++++ .../catalog/cards/go_emotions/simplified.json | 103 ++++++++++++++++++ src/unitxt/operators.py | 2 +- 3 files changed, 160 insertions(+), 1 deletion(-) create mode 100644 prepare/cards/go_emotions.py create mode 100644 src/unitxt/catalog/cards/go_emotions/simplified.json diff --git a/prepare/cards/go_emotions.py b/prepare/cards/go_emotions.py new file mode 100644 index 000000000..c0d7dc665 --- /dev/null +++ b/prepare/cards/go_emotions.py @@ -0,0 +1,56 @@ +from datasets import load_dataset_builder +from unitxt import add_to_catalog +from unitxt.blocks import ( + AddFields, + LoadHF, + TaskCard, +) +from unitxt.operators import MapInstanceValues +from unitxt.test_utils.card import test_card + +dataset_name = "go_emotions" +subset = "simplified" + +ds_builder = load_dataset_builder(dataset_name, subset) +classes = ds_builder.info.features["labels"].feature.names + +mappers = {str(i): cls for i, cls in enumerate(classes)} + +card = TaskCard( + loader=LoadHF(path=dataset_name, name=subset), + preprocess_steps=[ + MapInstanceValues(mappers={"labels": mappers}, process_every_value=True), + AddFields( + fields={ + "classes": classes, + "text_type": "text", + "type_of_classes": "emotions", + } + ), + ], + task="tasks.classification.multi_label", + templates="templates.classification.multi_label.all", + __tags__={ + "annotations_creators": "crowdsourced", + "arxiv": "2005.00547", + "croissant": True, + "language": "en", + "language_creators": "found", + "license": "apache-2.0", + "multilinguality": "monolingual", + "region": "us", + "size_categories": "10K