IBM · elronbandel · Feb 28, 2024 · Feb 21, 2024 · Feb 27, 2024 · Feb 27, 2024
diff --git a/prepare/cards/ffqa_filtered.py b/prepare/cards/ffqa_filtered.py
@@ -0,0 +1,124 @@
+from src.unitxt.blocks import (
+    LoadHF,
+    SplitRandomMix,
+    TaskCard,
+)
+from src.unitxt.catalog import add_to_catalog
+from src.unitxt.operators import (
+    AddFields,
+    CopyFields,
+    ExecuteExpression,
+    FilterByCondition,
+    ListFieldValues,
+)
+from src.unitxt.test_utils.card import test_card
+
+"""Filtered version of the WikiQA-Free_Form_QA dataset.
+If you would like to use the full dataset, please copy and modify this card as ffqa.py.
+"""
+
+# Dataset structure:
+# DatasetDict({
+#     2k: Dataset({
+#         features: ['conversations'],
+#         num_rows: 600
+#     })
+#     4k: Dataset({
+#         features: ['conversations'],
+#         num_rows: 600
+#     })
+#     8k: Dataset({
+#         features: ['conversations'],
+#         num_rows: 600
+#     })
+#     16k: Dataset({
+#         features: ['conversations'],
+#         num_rows: 600
+#     })
+# })
+#
+# conversations: [
+#     {
+#         "from": "human",
+#         "tok_len": int,
+#         "value": str,
+#     },
+#     {
+#         "from": "agent",
+#         "tok_len": int,
+#         "value": str,
+#     },
+# ]
+#
+# value (in some samples the Document and Question sections appear in the reverse order):
+#   Answer the question based on the information provided in the document given below. The answer should be a single word or a number or a short phrase of few words
+#
+#   Document:
+#   <document>
+#
+#   Question:
+#   <question>
+#
+
+
+# TODO: Remove duplicate data
+
+
+# The split names are only nominal: some samples are longer than the size their
+# split name suggests (for example, a sample of about 9000 tokens can appear in
+# the "8k" split).
+# This map gives, per split, the upper limit on the prompt token count
+# (`tok_len` of the human turn); it is used to exclude samples that are too long.
+# NOTE: Only the "8k" limit has been adjusted so far (8800 instead of 8192);
+# the other splits use their nominal power-of-two size.
+token_limit_map = {
+    "2k": 2048,
+    "4k": 4096,
+    "8k": 8800,
+    "16k": 16384,
+}
+
+
+# Build, test, and register the filtered FFQA card for one split of
+# abacusai/WikiQA-Free_Form_QA.
+#
+# split: name of the source split; it must be a key of `token_limit_map`
+#        (otherwise the lookup below raises KeyError).
+# The card is stored in the catalog as "cards.ffqa_filtered.<split>",
+# overwriting any existing entry of that name.
+def add_card(split: str):
+    card = TaskCard(
+        loader=LoadHF(path="abacusai/WikiQA-Free_Form_QA"),
+        preprocess_steps=[
+            # Flatten the nested `conversations` list: element 0 is the human
+            # turn (full prompt and its token length), element 1 is the agent
+            # turn (the reference answer).
+            CopyFields(
+                field_to_field={
+                    "conversations/0/value": "inputs",
+                    "conversations/0/tok_len": "inputs_len",
+                    "conversations/1/value": "answer",
+                },
+                use_query=True,
+            ),
+            # Expose the single answer as the list field `answers`.
+            ListFieldValues(fields=["answer"], to_field="answers"),
+            # Exclude over-long samples: compare the prompt token length
+            # against this split's limit with the "lt" condition.
+            FilterByCondition(
+                values={"inputs_len": token_limit_map[split]},
+                condition="lt",
+            ),
+            # Extract the document text that follows the "Document:" marker.
+            # NOTE(review): `(.*)` is greedy and re.DOTALL lets `.` match
+            # newlines, so the capture appears to extend to the end of the
+            # prompt, i.e. `context` would also contain a trailing
+            # "Question:" section. Confirm on real samples; a lazy `(.*?)`
+            # or an explicit section terminator may be intended. Any change
+            # here must be mirrored in the generated catalog JSON files.
+            # If the marker is missing, re.search returns None and `.group`
+            # raises AttributeError.
+            ExecuteExpression(
+                expression='re.search(r"Document:\\s(.*)(\\n\\n|$)", inputs, re.DOTALL).group(1)',
+                imports_list=["re"],
+                to_field="context",
+            ),
+            # Extract the question text that follows the "Question:" marker.
+            # NOTE(review): same greedy-capture concern as above; for samples
+            # where the question precedes the document, `question` would
+            # appear to include the whole "Document:" section. Confirm.
+            ExecuteExpression(
+                expression='re.search(r"Question:\\s(.*)(\\n\\n|$)", inputs, re.DOTALL).group(1)',
+                imports_list=["re"],
+                to_field="question",
+            ),
+            # Constant field added to every instance.
+            AddFields({"context_type": "document"}),
+            # The dataset has no train/validation/test splits of its own, so
+            # carve the chosen split into 80% / 10% / 10%.
+            SplitRandomMix(
+                {
+                    "train": f"{split}[80%]",
+                    "validation": f"{split}[10%]",
+                    "test": f"{split}[10%]",
+                }
+            ),
+        ],
+        task="tasks.qa.with_context.extractive",
+        templates="templates.qa.with_context.all",
+    )
+
+    # Run the card test before publishing the card to the catalog.
+    test_card(card)
+    add_to_catalog(card, f"cards.ffqa_filtered.{split}", overwrite=True)
+
+
+# Register one catalog card per source split. Every name listed here must be
+# a key of `token_limit_map`, because add_card looks the limit up by split name.
+for split in ["2k", "4k", "8k", "16k"]:
+    add_card(split)
diff --git a/prepare/templates/qa/with_context.py b/prepare/templates/qa/with_context.py
@@ -37,13 +37,29 @@
     overwrite=True,
 )
 
+# Template from https://huggingface.co/datasets/abacusai/WikiQA-Free_Form_QA
+# It reuses the dataset's own instruction sentence and fills in the `context`
+# and `question` fields; references are read from the `answers` list field.
+add_to_catalog(
+    MultiReferenceTemplate(
+        input_format="""\
+Answer the question based on the information provided in the document given below. The answer should be a single word or a number or a short phrase of few words
+Document: {context}
+Question: {question}
+Answer: """,
+        references_field="answers",
+    ),
+    "templates.qa.with_context.ffqa",
+    overwrite=True,
+)
+
+
 add_to_catalog(
     TemplatesList(
         [
             "templates.qa.with_context.simple",
             "templates.qa.with_context.simple2",
             "templates.qa.with_context.with_type",
             "templates.qa.with_context.question_first",
+            "templates.qa.with_context.ffqa",
         ]
     ),
     "templates.qa.with_context.all",

diff --git a/src/unitxt/catalog/cards/ffqa_filtered/16k.json b/src/unitxt/catalog/cards/ffqa_filtered/16k.json
@@ -0,0 +1,64 @@
+{
+    "type": "task_card",
+    "loader": {
+        "type": "load_hf",
+        "path": "abacusai/WikiQA-Free_Form_QA"
+    },
+    "preprocess_steps": [
+        {
+            "type": "copy_fields",
+            "field_to_field": {
+                "conversations/0/value": "inputs",
+                "conversations/0/tok_len": "inputs_len",
+                "conversations/1/value": "answer"
+            },
+            "use_query": true
+        },
+        {
+            "type": "list_field_values",
+            "fields": [
+                "answer"
+            ],
+            "to_field": "answers"
+        },
+        {
+            "type": "filter_by_condition",
+            "values": {
+                "inputs_len": 16384
+            },
+            "condition": "lt"
+        },
+        {
+            "type": "execute_expression",
+            "expression": "re.search(r\"Document:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)",
+            "imports_list": [
+                "re"
+            ],
+            "to_field": "context"
+        },
+        {
+            "type": "execute_expression",
+            "expression": "re.search(r\"Question:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)",
+            "imports_list": [
+                "re"
+            ],
+            "to_field": "question"
+        },
+        {
+            "type": "add_fields",
+            "fields": {
+                "context_type": "document"
+            }
+        },
+        {
+            "type": "split_random_mix",
+            "mix": {
+                "train": "16k[80%]",
+                "validation": "16k[10%]",
+                "test": "16k[10%]"
+            }
+        }
+    ],
+    "task": "tasks.qa.with_context.extractive",
+    "templates": "templates.qa.with_context.all"
+}
diff --git a/src/unitxt/catalog/cards/ffqa_filtered/2k.json b/src/unitxt/catalog/cards/ffqa_filtered/2k.json
@@ -0,0 +1,64 @@
+{
+    "type": "task_card",
+    "loader": {
+        "type": "load_hf",
+        "path": "abacusai/WikiQA-Free_Form_QA"
+    },
+    "preprocess_steps": [
+        {
+            "type": "copy_fields",
+            "field_to_field": {
+                "conversations/0/value": "inputs",
+                "conversations/0/tok_len": "inputs_len",
+                "conversations/1/value": "answer"
+            },
+            "use_query": true
+        },
+        {
+            "type": "list_field_values",
+            "fields": [
+                "answer"
+            ],
+            "to_field": "answers"
+        },
+        {
+            "type": "filter_by_condition",
+            "values": {
+                "inputs_len": 2048
+            },
+            "condition": "lt"
+        },
+        {
+            "type": "execute_expression",
+            "expression": "re.search(r\"Document:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)",
+            "imports_list": [
+                "re"
+            ],
+            "to_field": "context"
+        },
+        {
+            "type": "execute_expression",
+            "expression": "re.search(r\"Question:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)",
+            "imports_list": [
+                "re"
+            ],
+            "to_field": "question"
+        },
+        {
+            "type": "add_fields",
+            "fields": {
+                "context_type": "document"
+            }
+        },
+        {
+            "type": "split_random_mix",
+            "mix": {
+                "train": "2k[80%]",
+                "validation": "2k[10%]",
+                "test": "2k[10%]"
+            }
+        }
+    ],
+    "task": "tasks.qa.with_context.extractive",
+    "templates": "templates.qa.with_context.all"
+}
diff --git a/src/unitxt/catalog/cards/ffqa_filtered/4k.json b/src/unitxt/catalog/cards/ffqa_filtered/4k.json
@@ -0,0 +1,64 @@
+{
+    "type": "task_card",
+    "loader": {
+        "type": "load_hf",
+        "path": "abacusai/WikiQA-Free_Form_QA"
+    },
+    "preprocess_steps": [
+        {
+            "type": "copy_fields",
+            "field_to_field": {
+                "conversations/0/value": "inputs",
+                "conversations/0/tok_len": "inputs_len",
+                "conversations/1/value": "answer"
+            },
+            "use_query": true
+        },
+        {
+            "type": "list_field_values",
+            "fields": [
+                "answer"
+            ],
+            "to_field": "answers"
+        },
+        {
+            "type": "filter_by_condition",
+            "values": {
+                "inputs_len": 4096
+            },
+            "condition": "lt"
+        },
+        {
+            "type": "execute_expression",
+            "expression": "re.search(r\"Document:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)",
+            "imports_list": [
+                "re"
+            ],
+            "to_field": "context"
+        },
+        {
+            "type": "execute_expression",
+            "expression": "re.search(r\"Question:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)",
+            "imports_list": [
+                "re"
+            ],
+            "to_field": "question"
+        },
+        {
+            "type": "add_fields",
+            "fields": {
+                "context_type": "document"
+            }
+        },
+        {
+            "type": "split_random_mix",
+            "mix": {
+                "train": "4k[80%]",
+                "validation": "4k[10%]",
+                "test": "4k[10%]"
+            }
+        }
+    ],
+    "task": "tasks.qa.with_context.extractive",
+    "templates": "templates.qa.with_context.all"
+}