Add filtered ffqa dataset

IBM · Feb 26, 2024 · 10cd1dc · 10cd1dc
1 parent 7b50122
commit 10cd1dc
Show file tree

Hide file tree

Showing 5 changed files with 429 additions and 0 deletions.
diff --git a/prepare/cards/ffqa_filtered.py b/prepare/cards/ffqa_filtered.py
@@ -0,0 +1,137 @@
+from src.unitxt.blocks import (
+    InputOutputTemplate,
+    LoadHF,
+    SplitRandomMix,
+    TaskCard,
+    TemplatesList,
+)
+from src.unitxt.catalog import add_to_catalog
+from src.unitxt.operators import (
+    AddFields,
+    CopyFields,
+    ExecuteExpression,
+    FilterByCondition,
+    ListFieldValues,
+)
+from src.unitxt.test_utils.card import test_card
+
+"""Filtered version of the WikiQA-Free_Form_QA dataset.
+If you would like to use the full dataset, please copy and modify this card as ffqa.py.
+"""
+
+# Dataset structure:
+# DatasetDict({
+#     2k: Dataset({
+#         features: ['conversations'],
+#         num_rows: 600
+#     })
+#     4k: Dataset({
+#         features: ['conversations'],
+#         num_rows: 600
+#     })
+#     8k: Dataset({
+#         features: ['conversations'],
+#         num_rows: 600
+#     })
+#     16k: Dataset({
+#         features: ['conversations'],
+#         num_rows: 600
+#     })
+# })
+#
+# conversations: [
+#     {
+#         "from": "human",
+#         "tok_len": int,
+#         "value": str,
+#     },
+#     {
+#         "from": "agent",
+#         "tok_len": int,
+#         "value": str,
+#     },
+# ]
+#
+# value (in some examples the Document and Question sections appear in the reverse order):
+#   Answer the question based on the information provided in the document given below. The answer should be a single word or a number or a short phrase of few words
+#
+#   Document:
+#   <document>
+#
+#   Question:
+#   <question>
+#
+
+
+# TODO: Remove duplicate data
+
+
+# Some examples are longer than the nominal size of their split.
+# For example, an example may be 9000 tokens long even though it is in the 8k split.
+# Set an upper token limit per split to exclude examples that are too long.
+# NOTE: Only the 8k limit has been adjusted for now.
+token_limit_map = {
+    "2k": 2048,
+    "4k": 4096,
+    "8k": 8800,  # the adjusted limit mentioned in the NOTE above
+    "16k": 16384,
+}
+
+
+def add_card(split: str):  # build the filtered card for one split, run test_card on it, add it to the catalog
+    card = TaskCard(
+        loader=LoadHF(path="abacusai/WikiQA-Free_Form_QA"),
+        preprocess_steps=[
+            CopyFields(  # copy the human prompt, its token length and the agent reply to top-level fields
+                field_to_field={
+                    "conversations/0/value": "inputs",
+                    "conversations/0/tok_len": "inputs_len",
+                    "conversations/1/value": "answer",
+                },
+                use_query=True,
+            ),
+            ListFieldValues(fields=["answer"], to_field="answers"),  # presumably wraps the answer in a list - verify
+            FilterByCondition(  # intended to exclude rows whose inputs_len is not below the split's limit
+                values={"inputs_len": token_limit_map[split]},  # KeyError if split is not in token_limit_map
+                condition="lt",
+            ),
+            ExecuteExpression(  # NOTE(review): greedy (.*) + re.DOTALL matches to end of input, so a later "Question:" part lands in context - confirm
+                expression='re.search(r"Document:\\s(.*)(\\n\\n|$)", inputs, re.DOTALL).group(1)',  # .group(1) on None fails if the marker is missing
+                imports_list=["re"],
+                to_field="context",
+            ),
+            ExecuteExpression(  # NOTE(review): same greedy capture; when "Document:" follows, it lands in question - confirm
+                expression='re.search(r"Question:\\s(.*)(\\n\\n|$)", inputs, re.DOTALL).group(1)',
+                imports_list=["re"],
+                to_field="question",
+            ),
+            AddFields({"context_type": "document"}),
+            SplitRandomMix(  # derive train/validation/test (80/10/10) from the single source split
+                {
+                    "train": f"{split}[80%]",
+                    "validation": f"{split}[10%]",
+                    "test": f"{split}[10%]",
+                }
+            ),
+        ],
+        task="tasks.qa.with_context.extractive",
+        templates=TemplatesList(
+            [
+                InputOutputTemplate(
+                    input_format="""\
+Answer the question based on the information provided in the document given below. The answer should be a single word or a number or a short phrase of few words
+Document: {context}
+Question: {question}
+Answer: """,
+                    output_format="{answers}",
+                ),
+            ]
+        ),
+    )
+
+    test_card(card)  # runs before the card is added to the catalog
+    add_to_catalog(card, f"cards.ffqa_filtered.{split}")
+
+
+for split in ["2k", "4k", "8k", "16k"]:
+    add_card(split)
diff --git a/src/unitxt/catalog/cards/ffqa_filtered/16k.json b/src/unitxt/catalog/cards/ffqa_filtered/16k.json
@@ -0,0 +1,73 @@
+{
+    "type": "task_card",
+    "loader": {
+        "type": "load_hf",
+        "path": "abacusai/WikiQA-Free_Form_QA"
+    },
+    "preprocess_steps": [
+        {
+            "type": "copy_fields",
+            "field_to_field": {
+                "conversations/0/value": "inputs",
+                "conversations/0/tok_len": "inputs_len",
+                "conversations/1/value": "answer"
+            },
+            "use_query": true
+        },
+        {
+            "type": "list_field_values",
+            "fields": [
+                "answer"
+            ],
+            "to_field": "answers"
+        },
+        {
+            "type": "filter_by_condition",
+            "values": {
+                "inputs_len": 16384
+            },
+            "condition": "lt"
+        },
+        {
+            "type": "execute_expression",
+            "expression": "re.search(r\"Document:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)",
+            "imports_list": [
+                "re"
+            ],
+            "to_field": "context"
+        },
+        {
+            "type": "execute_expression",
+            "expression": "re.search(r\"Question:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)",
+            "imports_list": [
+                "re"
+            ],
+            "to_field": "question"
+        },
+        {
+            "type": "add_fields",
+            "fields": {
+                "context_type": "document"
+            }
+        },
+        {
+            "type": "split_random_mix",
+            "mix": {
+                "train": "16k[80%]",
+                "validation": "16k[10%]",
+                "test": "16k[10%]"
+            }
+        }
+    ],
+    "task": "tasks.qa.with_context.extractive",
+    "templates": {
+        "type": "templates_list",
+        "items": [
+            {
+                "type": "input_output_template",
+                "input_format": "Answer the question based on the information provided in the document given below. The answer should be a single word or a number or a short phrase of few words\nDocument: {context}\nQuestion: {question}\nAnswer: ",
+                "output_format": "{answers}"
+            }
+        ]
+    }
+}
diff --git a/src/unitxt/catalog/cards/ffqa_filtered/2k.json b/src/unitxt/catalog/cards/ffqa_filtered/2k.json
@@ -0,0 +1,73 @@
+{
+    "type": "task_card",
+    "loader": {
+        "type": "load_hf",
+        "path": "abacusai/WikiQA-Free_Form_QA"
+    },
+    "preprocess_steps": [
+        {
+            "type": "copy_fields",
+            "field_to_field": {
+                "conversations/0/value": "inputs",
+                "conversations/0/tok_len": "inputs_len",
+                "conversations/1/value": "answer"
+            },
+            "use_query": true
+        },
+        {
+            "type": "list_field_values",
+            "fields": [
+                "answer"
+            ],
+            "to_field": "answers"
+        },
+        {
+            "type": "filter_by_condition",
+            "values": {
+                "inputs_len": 2048
+            },
+            "condition": "lt"
+        },
+        {
+            "type": "execute_expression",
+            "expression": "re.search(r\"Document:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)",
+            "imports_list": [
+                "re"
+            ],
+            "to_field": "context"
+        },
+        {
+            "type": "execute_expression",
+            "expression": "re.search(r\"Question:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)",
+            "imports_list": [
+                "re"
+            ],
+            "to_field": "question"
+        },
+        {
+            "type": "add_fields",
+            "fields": {
+                "context_type": "document"
+            }
+        },
+        {
+            "type": "split_random_mix",
+            "mix": {
+                "train": "2k[80%]",
+                "validation": "2k[10%]",
+                "test": "2k[10%]"
+            }
+        }
+    ],
+    "task": "tasks.qa.with_context.extractive",
+    "templates": {
+        "type": "templates_list",
+        "items": [
+            {
+                "type": "input_output_template",
+                "input_format": "Answer the question based on the information provided in the document given below. The answer should be a single word or a number or a short phrase of few words\nDocument: {context}\nQuestion: {question}\nAnswer: ",
+                "output_format": "{answers}"
+            }
+        ]
+    }
+}
diff --git a/src/unitxt/catalog/cards/ffqa_filtered/4k.json b/src/unitxt/catalog/cards/ffqa_filtered/4k.json
@@ -0,0 +1,73 @@
+{
+    "type": "task_card",
+    "loader": {
+        "type": "load_hf",
+        "path": "abacusai/WikiQA-Free_Form_QA"
+    },
+    "preprocess_steps": [
+        {
+            "type": "copy_fields",
+            "field_to_field": {
+                "conversations/0/value": "inputs",
+                "conversations/0/tok_len": "inputs_len",
+                "conversations/1/value": "answer"
+            },
+            "use_query": true
+        },
+        {
+            "type": "list_field_values",
+            "fields": [
+                "answer"
+            ],
+            "to_field": "answers"
+        },
+        {
+            "type": "filter_by_condition",
+            "values": {
+                "inputs_len": 4096
+            },
+            "condition": "lt"
+        },
+        {
+            "type": "execute_expression",
+            "expression": "re.search(r\"Document:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)",
+            "imports_list": [
+                "re"
+            ],
+            "to_field": "context"
+        },
+        {
+            "type": "execute_expression",
+            "expression": "re.search(r\"Question:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)",
+            "imports_list": [
+                "re"
+            ],
+            "to_field": "question"
+        },
+        {
+            "type": "add_fields",
+            "fields": {
+                "context_type": "document"
+            }
+        },
+        {
+            "type": "split_random_mix",
+            "mix": {
+                "train": "4k[80%]",
+                "validation": "4k[10%]",
+                "test": "4k[10%]"
+            }
+        }
+    ],
+    "task": "tasks.qa.with_context.extractive",
+    "templates": {
+        "type": "templates_list",
+        "items": [
+            {
+                "type": "input_output_template",
+                "input_format": "Answer the question based on the information provided in the document given below. The answer should be a single word or a number or a short phrase of few words\nDocument: {context}\nQuestion: {question}\nAnswer: ",
+                "output_format": "{answers}"
+            }
+        ]
+    }
+}