Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add filtered ffqa dataset #593

Merged
merged 3 commits into from
Feb 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 124 additions & 0 deletions prepare/cards/ffqa_filtered.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
from src.unitxt.blocks import (
LoadHF,
SplitRandomMix,
TaskCard,
)
from src.unitxt.catalog import add_to_catalog
from src.unitxt.operators import (
AddFields,
CopyFields,
ExecuteExpression,
FilterByCondition,
ListFieldValues,
)
from src.unitxt.test_utils.card import test_card

"""Filtered version of the WikiQA-Free_Form_QA dataset.
If you would like to use the full dataset, please copy and modify this card as ffqa.py.
"""

# Dataset structure:
# DatasetDict({
# 2k: Dataset({
# features: ['conversations'],
# num_rows: 600
# })
# 4k: Dataset({
# features: ['conversations'],
# num_rows: 600
# })
# 8k: Dataset({
# features: ['conversations'],
# num_rows: 600
# })
# 16k: Dataset({
# features: ['conversations'],
# num_rows: 600
# })
# })
#
# conversations: [
# {
# "from": "human",
# "tok_len": int,
# "value": str,
# },
# {
# "from": "agent",
# "tok_len": int,
# "value": str,
# },
# ]
#
# value (in some rows the Document and Question sections appear in the opposite order):
# Answer the question based on the information provided in the document given below. The answer should be a single word or a number or a short phrase of few words
#
# Document:
# <document>
#
# Question:
# <question>
#


# TODO: Remove duplicate data


# Upper bound on the prompt token count ("tok_len") accepted for each split.
# A split's name is only nominal: a row in the 8k split may be around 9000
# tokens long, so rows at or above the limit are filtered out of the card.
# NOTE: only the 8k limit has been tuned so far; the others are the plain
# power-of-two size suggested by the split name.
token_limit_map = {
    "2k": 2 * 1024,
    "4k": 4 * 1024,
    "8k": 8800,  # tuned value, intentionally not 8 * 1024
    "16k": 16 * 1024,
}


def add_card(split: str) -> None:
    """Build, test and register the filtered FFQA card for one dataset split.

    The card loads ``abacusai/WikiQA-Free_Form_QA``, drops rows whose prompt
    token count is not below ``token_limit_map[split]``, extracts the document
    and the question from the prompt text, and divides the split 80/10/10
    into train/validation/test. The card is then checked with ``test_card``
    and stored in the catalog as ``cards.ffqa_filtered.<split>``.

    Args:
        split: Name of the source split; must be a key of ``token_limit_map``.

    Raises:
        KeyError: If ``split`` has no entry in ``token_limit_map``.
    """
    card = TaskCard(
        loader=LoadHF(path="abacusai/WikiQA-Free_Form_QA"),
        preprocess_steps=[
            # Turn 0 is the human prompt (text + token count), turn 1 is the
            # agent's answer.
            CopyFields(
                field_to_field={
                    "conversations/0/value": "inputs",
                    "conversations/0/tok_len": "inputs_len",
                    "conversations/1/value": "answer",
                },
                use_query=True,
            ),
            # Wrap the single answer in a list under "answers".
            ListFieldValues(fields=["answer"], to_field="answers"),
            # Keep only rows whose prompt is shorter than the split's limit.
            FilterByCondition(
                values={"inputs_len": token_limit_map[split]},
                condition="lt",
            ),
            # The capture is non-greedy and stops at the other section header
            # (or at the end of the prompt). A greedy ".*" under re.DOTALL
            # always runs to the end of the prompt, which would pull the
            # "Question:" section into the context. Both section orders
            # (Document first or Question first) are handled.
            ExecuteExpression(
                expression='re.search(r"Document:\\s(.*?)(?:\\n\\nQuestion:|\\s*$)", inputs, re.DOTALL).group(1)',
                imports_list=["re"],
                to_field="context",
            ),
            ExecuteExpression(
                expression='re.search(r"Question:\\s(.*?)(?:\\n\\nDocument:|\\s*$)", inputs, re.DOTALL).group(1)',
                imports_list=["re"],
                to_field="question",
            ),
            AddFields({"context_type": "document"}),
            # The dataset has no train/validation/test splits of its own, so
            # they are carved out of the selected length split.
            SplitRandomMix(
                {
                    "train": f"{split}[80%]",
                    "validation": f"{split}[10%]",
                    "test": f"{split}[10%]",
                }
            ),
        ],
        task="tasks.qa.with_context.extractive",
        templates="templates.qa.with_context.all",
    )

    test_card(card)
    add_to_catalog(card, f"cards.ffqa_filtered.{split}", overwrite=True)


# Register one card per split. Iterating token_limit_map (insertion order:
# 2k, 4k, 8k, 16k) keeps the split names and their token limits in a single
# place, so a split can never be registered without a limit.
for split in token_limit_map:
    add_card(split)
16 changes: 16 additions & 0 deletions prepare/templates/qa/with_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,29 @@
overwrite=True,
)

# Template from https://huggingface.co/datasets/abacusai/WikiQA-Free_Form_QA
# Registers the prompt as "templates.qa.with_context.ffqa", replacing any
# existing catalog entry of that name (overwrite=True).
# The backslash right after the opening triple quote is a line continuation,
# so the prompt does not start with a newline; the space after "Answer:" is
# part of the prompt text. {context} and {question} are the placeholders
# filled in for each instance.
add_to_catalog(
MultiReferenceTemplate(
input_format="""\
Answer the question based on the information provided in the document given below. The answer should be a single word or a number or a short phrase of few words
Document: {context}
Question: {question}
Answer: """,
references_field="answers",
),
"templates.qa.with_context.ffqa",
overwrite=True,
)


add_to_catalog(
TemplatesList(
[
"templates.qa.with_context.simple",
"templates.qa.with_context.simple2",
"templates.qa.with_context.with_type",
"templates.qa.with_context.question_first",
"templates.qa.with_context.ffqa",
]
),
"templates.qa.with_context.all",
Expand Down
64 changes: 64 additions & 0 deletions src/unitxt/catalog/cards/ffqa_filtered/16k.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
{
"type": "task_card",
"loader": {
"type": "load_hf",
"path": "abacusai/WikiQA-Free_Form_QA"
},
"preprocess_steps": [
{
"type": "copy_fields",
"field_to_field": {
"conversations/0/value": "inputs",
"conversations/0/tok_len": "inputs_len",
"conversations/1/value": "answer"
},
"use_query": true
},
{
"type": "list_field_values",
"fields": [
"answer"
],
"to_field": "answers"
},
{
"type": "filter_by_condition",
"values": {
"inputs_len": 16384
},
"condition": "lt"
},
{
"type": "execute_expression",
"expression": "re.search(r\"Document:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)",
"imports_list": [
"re"
],
"to_field": "context"
},
{
"type": "execute_expression",
"expression": "re.search(r\"Question:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)",
"imports_list": [
"re"
],
"to_field": "question"
},
{
"type": "add_fields",
"fields": {
"context_type": "document"
}
},
{
"type": "split_random_mix",
"mix": {
"train": "16k[80%]",
"validation": "16k[10%]",
"test": "16k[10%]"
}
}
],
"task": "tasks.qa.with_context.extractive",
"templates": "templates.qa.with_context.all"
}
64 changes: 64 additions & 0 deletions src/unitxt/catalog/cards/ffqa_filtered/2k.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
{
"type": "task_card",
"loader": {
"type": "load_hf",
"path": "abacusai/WikiQA-Free_Form_QA"
},
"preprocess_steps": [
{
"type": "copy_fields",
"field_to_field": {
"conversations/0/value": "inputs",
"conversations/0/tok_len": "inputs_len",
"conversations/1/value": "answer"
},
"use_query": true
},
{
"type": "list_field_values",
"fields": [
"answer"
],
"to_field": "answers"
},
{
"type": "filter_by_condition",
"values": {
"inputs_len": 2048
},
"condition": "lt"
},
{
"type": "execute_expression",
"expression": "re.search(r\"Document:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)",
"imports_list": [
"re"
],
"to_field": "context"
},
{
"type": "execute_expression",
"expression": "re.search(r\"Question:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)",
"imports_list": [
"re"
],
"to_field": "question"
},
{
"type": "add_fields",
"fields": {
"context_type": "document"
}
},
{
"type": "split_random_mix",
"mix": {
"train": "2k[80%]",
"validation": "2k[10%]",
"test": "2k[10%]"
}
}
],
"task": "tasks.qa.with_context.extractive",
"templates": "templates.qa.with_context.all"
}
64 changes: 64 additions & 0 deletions src/unitxt/catalog/cards/ffqa_filtered/4k.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
{
"type": "task_card",
"loader": {
"type": "load_hf",
"path": "abacusai/WikiQA-Free_Form_QA"
},
"preprocess_steps": [
{
"type": "copy_fields",
"field_to_field": {
"conversations/0/value": "inputs",
"conversations/0/tok_len": "inputs_len",
"conversations/1/value": "answer"
},
"use_query": true
},
{
"type": "list_field_values",
"fields": [
"answer"
],
"to_field": "answers"
},
{
"type": "filter_by_condition",
"values": {
"inputs_len": 4096
},
"condition": "lt"
},
{
"type": "execute_expression",
"expression": "re.search(r\"Document:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)",
"imports_list": [
"re"
],
"to_field": "context"
},
{
"type": "execute_expression",
"expression": "re.search(r\"Question:\\s(.*)(\\n\\n|$)\", inputs, re.DOTALL).group(1)",
"imports_list": [
"re"
],
"to_field": "question"
},
{
"type": "add_fields",
"fields": {
"context_type": "document"
}
},
{
"type": "split_random_mix",
"mix": {
"train": "4k[80%]",
"validation": "4k[10%]",
"test": "4k[10%]"
}
}
],
"task": "tasks.qa.with_context.extractive",
"templates": "templates.qa.with_context.all"
}
Loading
Loading