diff --git a/prepare/cards/ffqa_filtered.py b/prepare/cards/ffqa_filtered.py new file mode 100644 index 000000000..8d5c66721 --- /dev/null +++ b/prepare/cards/ffqa_filtered.py @@ -0,0 +1,137 @@ +from src.unitxt.blocks import ( + InputOutputTemplate, + LoadHF, + SplitRandomMix, + TaskCard, + TemplatesList, +) +from src.unitxt.catalog import add_to_catalog +from src.unitxt.operators import ( + AddFields, + CopyFields, + ExecuteExpression, + FilterByCondition, + ListFieldValues, +) +from src.unitxt.test_utils.card import test_card + +"""Filtered version of the WikiQA-Free_Form_QA dataset. +If you would like to use the full dataset, please copy and modify this card as ffqa.py. +""" + +# Dataset structure: +# DatasetDict({ +# 2k: Dataset({ +# features: ['conversations'], +# num_rows: 600 +# }) +# 4k: Dataset({ +# features: ['conversations'], +# num_rows: 600 +# }) +# 8k: Dataset({ +# features: ['conversations'], +# num_rows: 600 +# }) +# 16k: Dataset({ +# features: ['conversations'], +# num_rows: 600 +# }) +# }) +# +# conversations: [ +# { +# "from": "human", +# "tok_len": int, +# "value": str, +# }, +# { +# "from": "agent", +# "tok_len": int, +# "value": str, +# }, +# ] +# +# value (in some examples the Document and Question sections appear in the reverse order): +# Answer the question based on the information provided in the document given below. The answer should be a single word or a number or a short phrase of few words +# +# Document: +# +# +# Question: +# +# + + +# TODO: Remove duplicate data + + +# Some examples are longer than the name of their split suggests. +# For example, an example in the 8k split may be 9000 tokens long. +# Set an upper token limit to exclude examples that are too long. +# NOTE: Only the limit for the 8k split has been adjusted for now. 
+token_limit_map = { + "2k": 2048, + "4k": 4096, + "8k": 8800, + "16k": 16384, +} + + +def add_card(split: str): + card = TaskCard( + loader=LoadHF(path="abacusai/WikiQA-Free_Form_QA"), + preprocess_steps=[ + CopyFields( + field_to_field={ + "conversations/0/value": "inputs", + "conversations/0/tok_len": "inputs_len", + "conversations/1/value": "answer", + }, + use_query=True, + ), + ListFieldValues(fields=["answer"], to_field="answers"), + FilterByCondition( + values={"inputs_len": token_limit_map[split]}, + condition="lt", + ), + ExecuteExpression( + expression='re.search(r"Document:\\s(.*)(\\n\\n|$)", inputs, re.DOTALL).group(1)', + imports_list=["re"], + to_field="context", + ), + ExecuteExpression( + expression='re.search(r"Question:\\s(.*)(\\n\\n|$)", inputs, re.DOTALL).group(1)', + imports_list=["re"], + to_field="question", + ), + AddFields({"context_type": "document"}), + SplitRandomMix( + { + "train": f"{split}[80%]", + "validation": f"{split}[10%]", + "test": f"{split}[10%]", + } + ), + ], + task="tasks.qa.with_context.extractive", + templates=TemplatesList( + [ + InputOutputTemplate( + input_format="""\ +Answer the question based on the information provided in the document given below. 
The answer should be a single word or a number or a short phrase of few words +Document: {context} +Question: {question} +Answer: """, + output_format="{answers}", + ), + ] + ), + ) + + test_card(card) + add_to_catalog(card, f"cards.ffqa_filtered.{split}") + + +for split in ["2k", "4k", "8k", "16k"]: + add_card(split) diff --git a/src/unitxt/catalog/cards/ffqa_filtered/16k.json b/src/unitxt/catalog/cards/ffqa_filtered/16k.json new file mode 100644 index 000000000..7e10ba90a --- /dev/null +++ b/src/unitxt/catalog/cards/ffqa_filtered/16k.json @@ -0,0 +1,73 @@ +{ + "type": "task_card", + "loader": { + "type": "load_hf", + "path": "abacusai/WikiQA-Free_Form_QA" + }, + "preprocess_steps": [ + { + "type": "copy_fields", + "field_to_field": { + "conversations/0/value": "inputs", + "conversations/0/tok_len": "inputs_len", + "conversations/1/value": "answer" + }, + "use_query": true + }, + { + "type": "list_field_values", + "fields": [ + "answer" + ], + "to_field": "answers" + }, + { + "type": "filter_by_condition", + "values": { + "inputs_len": 16384 + }, + "condition": "lt" + }, + { + "type": "execute_expression", + "expression": "re.search(r\"Document:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)", + "imports_list": [ + "re" + ], + "to_field": "context" + }, + { + "type": "execute_expression", + "expression": "re.search(r\"Question:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)", + "imports_list": [ + "re" + ], + "to_field": "question" + }, + { + "type": "add_fields", + "fields": { + "context_type": "document" + } + }, + { + "type": "split_random_mix", + "mix": { + "train": "16k[80%]", + "validation": "16k[10%]", + "test": "16k[10%]" + } + } + ], + "task": "tasks.qa.with_context.extractive", + "templates": { + "type": "templates_list", + "items": [ + { + "type": "input_output_template", + "input_format": "Answer the question based on the information provided in the document given below. 
The answer should be a single word or a number or a short phrase of few words\nDocument: {context}\nQuestion: {question}\nAnswer: ", + "output_format": "{answers}" + } + ] + } +} diff --git a/src/unitxt/catalog/cards/ffqa_filtered/2k.json b/src/unitxt/catalog/cards/ffqa_filtered/2k.json new file mode 100644 index 000000000..b5eabb23a --- /dev/null +++ b/src/unitxt/catalog/cards/ffqa_filtered/2k.json @@ -0,0 +1,73 @@ +{ + "type": "task_card", + "loader": { + "type": "load_hf", + "path": "abacusai/WikiQA-Free_Form_QA" + }, + "preprocess_steps": [ + { + "type": "copy_fields", + "field_to_field": { + "conversations/0/value": "inputs", + "conversations/0/tok_len": "inputs_len", + "conversations/1/value": "answer" + }, + "use_query": true + }, + { + "type": "list_field_values", + "fields": [ + "answer" + ], + "to_field": "answers" + }, + { + "type": "filter_by_condition", + "values": { + "inputs_len": 2048 + }, + "condition": "lt" + }, + { + "type": "execute_expression", + "expression": "re.search(r\"Document:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)", + "imports_list": [ + "re" + ], + "to_field": "context" + }, + { + "type": "execute_expression", + "expression": "re.search(r\"Question:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)", + "imports_list": [ + "re" + ], + "to_field": "question" + }, + { + "type": "add_fields", + "fields": { + "context_type": "document" + } + }, + { + "type": "split_random_mix", + "mix": { + "train": "2k[80%]", + "validation": "2k[10%]", + "test": "2k[10%]" + } + } + ], + "task": "tasks.qa.with_context.extractive", + "templates": { + "type": "templates_list", + "items": [ + { + "type": "input_output_template", + "input_format": "Answer the question based on the information provided in the document given below. 
The answer should be a single word or a number or a short phrase of few words\nDocument: {context}\nQuestion: {question}\nAnswer: ", + "output_format": "{answers}" + } + ] + } +} diff --git a/src/unitxt/catalog/cards/ffqa_filtered/4k.json b/src/unitxt/catalog/cards/ffqa_filtered/4k.json new file mode 100644 index 000000000..37e243463 --- /dev/null +++ b/src/unitxt/catalog/cards/ffqa_filtered/4k.json @@ -0,0 +1,73 @@ +{ + "type": "task_card", + "loader": { + "type": "load_hf", + "path": "abacusai/WikiQA-Free_Form_QA" + }, + "preprocess_steps": [ + { + "type": "copy_fields", + "field_to_field": { + "conversations/0/value": "inputs", + "conversations/0/tok_len": "inputs_len", + "conversations/1/value": "answer" + }, + "use_query": true + }, + { + "type": "list_field_values", + "fields": [ + "answer" + ], + "to_field": "answers" + }, + { + "type": "filter_by_condition", + "values": { + "inputs_len": 4096 + }, + "condition": "lt" + }, + { + "type": "execute_expression", + "expression": "re.search(r\"Document:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)", + "imports_list": [ + "re" + ], + "to_field": "context" + }, + { + "type": "execute_expression", + "expression": "re.search(r\"Question:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)", + "imports_list": [ + "re" + ], + "to_field": "question" + }, + { + "type": "add_fields", + "fields": { + "context_type": "document" + } + }, + { + "type": "split_random_mix", + "mix": { + "train": "4k[80%]", + "validation": "4k[10%]", + "test": "4k[10%]" + } + } + ], + "task": "tasks.qa.with_context.extractive", + "templates": { + "type": "templates_list", + "items": [ + { + "type": "input_output_template", + "input_format": "Answer the question based on the information provided in the document given below. 
The answer should be a single word or a number or a short phrase of few words\nDocument: {context}\nQuestion: {question}\nAnswer: ", + "output_format": "{answers}" + } + ] + } +} diff --git a/src/unitxt/catalog/cards/ffqa_filtered/8k.json b/src/unitxt/catalog/cards/ffqa_filtered/8k.json new file mode 100644 index 000000000..de5cdbac3 --- /dev/null +++ b/src/unitxt/catalog/cards/ffqa_filtered/8k.json @@ -0,0 +1,73 @@ +{ + "type": "task_card", + "loader": { + "type": "load_hf", + "path": "abacusai/WikiQA-Free_Form_QA" + }, + "preprocess_steps": [ + { + "type": "copy_fields", + "field_to_field": { + "conversations/0/value": "inputs", + "conversations/0/tok_len": "inputs_len", + "conversations/1/value": "answer" + }, + "use_query": true + }, + { + "type": "list_field_values", + "fields": [ + "answer" + ], + "to_field": "answers" + }, + { + "type": "filter_by_condition", + "values": { + "inputs_len": 8800 + }, + "condition": "lt" + }, + { + "type": "execute_expression", + "expression": "re.search(r\"Document:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)", + "imports_list": [ + "re" + ], + "to_field": "context" + }, + { + "type": "execute_expression", + "expression": "re.search(r\"Question:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)", + "imports_list": [ + "re" + ], + "to_field": "question" + }, + { + "type": "add_fields", + "fields": { + "context_type": "document" + } + }, + { + "type": "split_random_mix", + "mix": { + "train": "8k[80%]", + "validation": "8k[10%]", + "test": "8k[10%]" + } + } + ], + "task": "tasks.qa.with_context.extractive", + "templates": { + "type": "templates_list", + "items": [ + { + "type": "input_output_template", + "input_format": "Answer the question based on the information provided in the document given below. The answer should be a single word or a number or a short phrase of few words\nDocument: {context}\nQuestion: {question}\nAnswer: ", + "output_format": "{answers}" + } + ] + } +}