Skip to content

Commit

Permalink
Add filtered ffqa dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
marukaz authored and elronbandel committed Feb 26, 2024
1 parent 7b50122 commit 10cd1dc
Show file tree
Hide file tree
Showing 5 changed files with 429 additions and 0 deletions.
137 changes: 137 additions & 0 deletions prepare/cards/ffqa_filtered.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
from src.unitxt.blocks import (
InputOutputTemplate,
LoadHF,
SplitRandomMix,
TaskCard,
TemplatesList,
)
from src.unitxt.catalog import add_to_catalog
from src.unitxt.operators import (
AddFields,
CopyFields,
ExecuteExpression,
FilterByCondition,
ListFieldValues,
)
from src.unitxt.test_utils.card import test_card

"""Filtered version of the WikiQA-Free_Form_QA dataset.
To use the full dataset instead, copy this card to ffqa.py and adapt it (e.g. drop the token-length filter).
"""

# Dataset structure:
# DatasetDict({
# 2k: Dataset({
# features: ['conversations'],
# num_rows: 600
# })
# 4k: Dataset({
# features: ['conversations'],
# num_rows: 600
# })
# 8k: Dataset({
# features: ['conversations'],
# num_rows: 600
# })
# 16k: Dataset({
# features: ['conversations'],
# num_rows: 600
# })
# })
#
# conversations: [
# {
# "from": "human",
# "tok_len": int,
# "value": str,
# },
# {
# "from": "agent",
# "tok_len": int,
# "value": str,
# },
# ]
#
# value (in some records the Document and Question sections appear in the reverse order):
# Answer the question based on the information provided in the document given below. The answer should be a single word or a number or a short phrase of few words
#
# Document:
# <document>
#
# Question:
# <question>
#


# TODO: Remove duplicate data


# Upper bound on the prompt token count, keyed by split name.
# A split may contain samples longer than its name suggests (e.g. inputs of
# about 9000 tokens inside the "8k" split), so each split gets a cap that is
# used to exclude the over-long samples.
# NOTE: Only the "8k" cap has been adjusted so far.
token_limit_map = {"2k": 2048, "4k": 4096, "8k": 8800, "16k": 16384}


def add_card(split: str):
    """Build, test and register the filtered FFQA card for one split.

    The card loads ``abacusai/WikiQA-Free_Form_QA``, derives ``context``,
    ``question`` and ``answers`` fields from the raw ``conversations`` record,
    applies the per-split token limit, and is stored in the catalog as
    ``cards.ffqa_filtered.<split>``.

    Args:
        split: Name of the source split (e.g. ``"8k"``). It must be a key of
            ``token_limit_map``; an unknown name raises ``KeyError``.
    """
    loader = LoadHF(path="abacusai/WikiQA-Free_Form_QA")

    # Map the prompt text, its token count and the reference answer from the
    # two-entry ``conversations`` list onto flat field names.
    flatten_conversation = CopyFields(
        field_to_field={
            "conversations/0/value": "inputs",
            "conversations/0/tok_len": "inputs_len",
            "conversations/1/value": "answer",
        },
        use_query=True,
    )
    collect_answers = ListFieldValues(fields=["answer"], to_field="answers")

    # Exclude prompts that are too long for this split ("lt" against the cap).
    max_tokens = token_limit_map[split]
    drop_too_long = FilterByCondition(
        values={"inputs_len": max_tokens},
        condition="lt",
    )

    # NOTE(review): in both expressions ``.*`` is greedy and re.DOTALL lets it
    # span newlines, so the capture may run past the first blank line (up to
    # the end of the prompt) -- confirm against real records that this is the
    # intended extraction.
    extract_context = ExecuteExpression(
        expression='re.search(r"Document:\\s(.*)(\\n\\n|$)", inputs, re.DOTALL).group(1)',
        imports_list=["re"],
        to_field="context",
    )
    extract_question = ExecuteExpression(
        expression='re.search(r"Question:\\s(.*)(\\n\\n|$)", inputs, re.DOTALL).group(1)',
        imports_list=["re"],
        to_field="question",
    )
    tag_context_type = AddFields({"context_type": "document"})

    # The single source split feeds train/validation/test at 80/10/10.
    resplit = SplitRandomMix(
        {
            "train": f"{split}[80%]",
            "validation": f"{split}[10%]",
            "test": f"{split}[10%]",
        }
    )

    prompt_template = InputOutputTemplate(
        input_format="""\
Answer the question based on the information provided in the document given below. The answer should be a single word or a number or a short phrase of few words
Document: {context}
Question: {question}
Answer: """,
        output_format="{answers}",
    )

    card = TaskCard(
        loader=loader,
        preprocess_steps=[
            flatten_conversation,
            collect_answers,
            drop_too_long,
            extract_context,
            extract_question,
            tag_context_type,
            resplit,
        ],
        task="tasks.qa.with_context.extractive",
        templates=TemplatesList([prompt_template]),
    )

    # Validate the card before it is written to the catalog.
    test_card(card)
    add_to_catalog(card, f"cards.ffqa_filtered.{split}")


# Build, test and register one catalog card per context-length split.
for split in ("2k", "4k", "8k", "16k"):
    add_card(split)
73 changes: 73 additions & 0 deletions src/unitxt/catalog/cards/ffqa_filtered/16k.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
{
"type": "task_card",
"loader": {
"type": "load_hf",
"path": "abacusai/WikiQA-Free_Form_QA"
},
"preprocess_steps": [
{
"type": "copy_fields",
"field_to_field": {
"conversations/0/value": "inputs",
"conversations/0/tok_len": "inputs_len",
"conversations/1/value": "answer"
},
"use_query": true
},
{
"type": "list_field_values",
"fields": [
"answer"
],
"to_field": "answers"
},
{
"type": "filter_by_condition",
"values": {
"inputs_len": 16384
},
"condition": "lt"
},
{
"type": "execute_expression",
"expression": "re.search(r\"Document:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)",
"imports_list": [
"re"
],
"to_field": "context"
},
{
"type": "execute_expression",
"expression": "re.search(r\"Question:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)",
"imports_list": [
"re"
],
"to_field": "question"
},
{
"type": "add_fields",
"fields": {
"context_type": "document"
}
},
{
"type": "split_random_mix",
"mix": {
"train": "16k[80%]",
"validation": "16k[10%]",
"test": "16k[10%]"
}
}
],
"task": "tasks.qa.with_context.extractive",
"templates": {
"type": "templates_list",
"items": [
{
"type": "input_output_template",
"input_format": "Answer the question based on the information provided in the document given below. The answer should be a single word or a number or a short phrase of few words\nDocument: {context}\nQuestion: {question}\nAnswer: ",
"output_format": "{answers}"
}
]
}
}
73 changes: 73 additions & 0 deletions src/unitxt/catalog/cards/ffqa_filtered/2k.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
{
"type": "task_card",
"loader": {
"type": "load_hf",
"path": "abacusai/WikiQA-Free_Form_QA"
},
"preprocess_steps": [
{
"type": "copy_fields",
"field_to_field": {
"conversations/0/value": "inputs",
"conversations/0/tok_len": "inputs_len",
"conversations/1/value": "answer"
},
"use_query": true
},
{
"type": "list_field_values",
"fields": [
"answer"
],
"to_field": "answers"
},
{
"type": "filter_by_condition",
"values": {
"inputs_len": 2048
},
"condition": "lt"
},
{
"type": "execute_expression",
"expression": "re.search(r\"Document:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)",
"imports_list": [
"re"
],
"to_field": "context"
},
{
"type": "execute_expression",
"expression": "re.search(r\"Question:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)",
"imports_list": [
"re"
],
"to_field": "question"
},
{
"type": "add_fields",
"fields": {
"context_type": "document"
}
},
{
"type": "split_random_mix",
"mix": {
"train": "2k[80%]",
"validation": "2k[10%]",
"test": "2k[10%]"
}
}
],
"task": "tasks.qa.with_context.extractive",
"templates": {
"type": "templates_list",
"items": [
{
"type": "input_output_template",
"input_format": "Answer the question based on the information provided in the document given below. The answer should be a single word or a number or a short phrase of few words\nDocument: {context}\nQuestion: {question}\nAnswer: ",
"output_format": "{answers}"
}
]
}
}
73 changes: 73 additions & 0 deletions src/unitxt/catalog/cards/ffqa_filtered/4k.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
{
"type": "task_card",
"loader": {
"type": "load_hf",
"path": "abacusai/WikiQA-Free_Form_QA"
},
"preprocess_steps": [
{
"type": "copy_fields",
"field_to_field": {
"conversations/0/value": "inputs",
"conversations/0/tok_len": "inputs_len",
"conversations/1/value": "answer"
},
"use_query": true
},
{
"type": "list_field_values",
"fields": [
"answer"
],
"to_field": "answers"
},
{
"type": "filter_by_condition",
"values": {
"inputs_len": 4096
},
"condition": "lt"
},
{
"type": "execute_expression",
"expression": "re.search(r\"Document:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)",
"imports_list": [
"re"
],
"to_field": "context"
},
{
"type": "execute_expression",
"expression": "re.search(r\"Question:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)",
"imports_list": [
"re"
],
"to_field": "question"
},
{
"type": "add_fields",
"fields": {
"context_type": "document"
}
},
{
"type": "split_random_mix",
"mix": {
"train": "4k[80%]",
"validation": "4k[10%]",
"test": "4k[10%]"
}
}
],
"task": "tasks.qa.with_context.extractive",
"templates": {
"type": "templates_list",
"items": [
{
"type": "input_output_template",
"input_format": "Answer the question based on the information provided in the document given below. The answer should be a single word or a number or a short phrase of few words\nDocument: {context}\nQuestion: {question}\nAnswer: ",
"output_format": "{answers}"
}
]
}
}
Loading

0 comments on commit 10cd1dc

Please sign in to comment.