Skip to content

Commit

Permalink
Adding go emotions dataset (#865)
Browse files Browse the repository at this point in the history
  • Loading branch information
shaigrt committed May 26, 2024
1 parent de631e1 commit ab83f49
Show file tree
Hide file tree
Showing 3 changed files with 160 additions and 1 deletion.
56 changes: 56 additions & 0 deletions prepare/cards/go_emotions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from datasets import load_dataset_builder
from unitxt import add_to_catalog
from unitxt.blocks import (
AddFields,
LoadHF,
TaskCard,
)
from unitxt.operators import MapInstanceValues
from unitxt.test_utils.card import test_card

# Prepare script for the GoEmotions card ("simplified" configuration):
# build the TaskCard, run it through test_card, then store it in the catalog.
dataset_name = "go_emotions"
subset = "simplified"

# Class names are read from the dataset builder's "labels" feature metadata.
ds_builder = load_dataset_builder(dataset_name, subset)
classes = ds_builder.info.features["labels"].feature.names

# Label index (rendered as a string) -> class name.
mappers = dict((str(index), label) for index, label in enumerate(classes))

loader = LoadHF(path=dataset_name, name=subset)

preprocess_steps = [
    # "labels" holds a list per instance, so every element of it is mapped.
    MapInstanceValues(mappers={"labels": mappers}, process_every_value=True),
    # Constant fields added to each instance alongside the mapped labels.
    AddFields(
        fields={
            "classes": classes,
            "text_type": "text",
            "type_of_classes": "emotions",
        }
    ),
]

tags = {
    "annotations_creators": "crowdsourced",
    "arxiv": "2005.00547",
    "croissant": True,
    "language": "en",
    "language_creators": "found",
    "license": "apache-2.0",
    "multilinguality": "monolingual",
    "region": "us",
    "size_categories": "10K<n<100K",
    "source_datasets": "original",
    "task_categories": "text-classification",
    "task_ids": ["multi-class-classification", "multi-label-classification"],
}

description = (
    "Dataset Card for Go Emotions\n"
    "Dataset Summary\n"
    "The GoEmotions dataset contains 58k carefully curated Reddit comments labeled for 27 emotion categories or Neutral.\n"
    "This card contains the simplified version of the dataset with predefined train/val/test splits.\n"
    "See full description on the dataset page: https://huggingface.co/datasets/go_emotions."
)

card = TaskCard(
    loader=loader,
    preprocess_steps=preprocess_steps,
    task="tasks.classification.multi_label",
    templates="templates.classification.multi_label.all",
    __tags__=tags,
    __description__=description,
)

# Check the card first, then register it under cards.<dataset>.<subset>.
test_card(card, debug=False)
add_to_catalog(card, f"cards.{dataset_name}.{subset}", overwrite=True)
103 changes: 103 additions & 0 deletions src/unitxt/catalog/cards/go_emotions/simplified.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
{
"type": "task_card",
"loader": {
"type": "load_hf",
"path": "go_emotions",
"name": "simplified"
},
"preprocess_steps": [
{
"type": "map_instance_values",
"mappers": {
"labels": {
"0": "admiration",
"1": "amusement",
"2": "anger",
"3": "annoyance",
"4": "approval",
"5": "caring",
"6": "confusion",
"7": "curiosity",
"8": "desire",
"9": "disappointment",
"10": "disapproval",
"11": "disgust",
"12": "embarrassment",
"13": "excitement",
"14": "fear",
"15": "gratitude",
"16": "grief",
"17": "joy",
"18": "love",
"19": "nervousness",
"20": "optimism",
"21": "pride",
"22": "realization",
"23": "relief",
"24": "remorse",
"25": "sadness",
"26": "surprise",
"27": "neutral"
}
},
"process_every_value": true
},
{
"type": "add_fields",
"fields": {
"classes": [
"admiration",
"amusement",
"anger",
"annoyance",
"approval",
"caring",
"confusion",
"curiosity",
"desire",
"disappointment",
"disapproval",
"disgust",
"embarrassment",
"excitement",
"fear",
"gratitude",
"grief",
"joy",
"love",
"nervousness",
"optimism",
"pride",
"realization",
"relief",
"remorse",
"sadness",
"surprise",
"neutral"
],
"text_type": "text",
"type_of_classes": "emotions"
}
}
],
"task": "tasks.classification.multi_label",
"templates": "templates.classification.multi_label.all",
"__tags__": {
"annotations_creators": "crowdsourced",
"arxiv": "2005.00547",
"croissant": true,
"language": "en",
"language_creators": "found",
"license": "apache-2.0",
"multilinguality": "monolingual",
"region": "us",
"size_categories": "10K<n<100K",
"source_datasets": "original",
"task_categories": "text-classification",
"task_ids": [
"multi-class-classification",
"multi-label-classification"
]
},
"__description__": "Dataset Card for Go Emotions\nDataset Summary\nThe GoEmotions dataset contains 58k carefully curated Reddit comments labeled for 27 emotion categories or Neutral.\nThis card contains the simplified version of the dataset with predefined train/val/test splits.\nSee full description on the dataset page: https://huggingface.co/datasets/go_emotions."
}
2 changes: 1 addition & 1 deletion src/unitxt/operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ class MapInstanceValues(InstanceOperator):
replaces '1' with 'hi' and '2' with 'bye' in field 'a' in all instances of all streams:
instance {"a":"1", "b": 2} becomes {"a":"hi", "b": 2}.
MapInstanceValues(mappers={"a": {"1": "hi", "2": "bye"}}, process_every_element=True)
MapInstanceValues(mappers={"a": {"1": "hi", "2": "bye"}}, process_every_value=True)
Assuming field 'a' is a list of values that may include "1"s and "2"s, this replaces
each "1" with "hi" and each "2" with "bye" in all instances of all streams:
instance {"a": ["1", "2"], "b": 2} becomes {"a": ["hi", "bye"], "b": 2}.
Expand Down

0 comments on commit ab83f49

Please sign in to comment.