From 7ed3d7a83823c95ed35c59f82ea2669f04e0e5d7 Mon Sep 17 00:00:00 2001 From: WonJin Yoon Date: Tue, 14 May 2019 19:52:45 +0900 Subject: [PATCH] QA dataset replaced and transform_nbset2bioasqform changed. --- README.md | 4 +- biocodes/transform_nbset2bioasqform.py | 83 ++++++++++++++++++++------ 2 files changed, 68 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 88e7700..8ac157d 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ All the fine-tuning experiments were conducted on a single TITAN Xp GPU machine We provide pre-processed version of benchmark datasets for each task as follows: * **[`Named Entity Recognition`](https://drive.google.com/open?id=1OletxmPYNkz2ltOr9pyT0b0iBtUWxslh)**: (17.3 MB), 8 datasets on biomedical named entity recognition * **[`Relation Extraction`](https://drive.google.com/open?id=1-jDKGcXREb2X9xTFnuiJ36PvsqoyHWcw)**: (2.5 MB), 2 datasets on biomedical relation extraction -* **[`Question Answering`](https://drive.google.com/open?id=1R2aTOdvGlce95OVQLXnJ_pdm3i6UmVOw)**: (1.10 MB), 2 datasets on biomedical question answering task. +* **[`Question Answering`](https://drive.google.com/open?id=19ft5q44W4SuptJgTwR84xZjsHg1jvjSZ)**: (5.23 MB), 3 datasets on biomedical question answering task. For details on NER datasets, please see **A Neural Network Multi-Task Learning Approach to Biomedical Named Entity Recognition (Crichton et al. 2017)**. The source of pre-processed datasets are from https://github.com/cambridgeltl/MTL-Bioinformatics-2016 and https://github.com/spyysalo/s800. @@ -118,7 +118,7 @@ precision : 75.87% Please be aware that you have to move `output_dir` to make new model. As some RE datasets are 10-fold divided, you have to make different output directories to train a model with different datasets. ### Question Answering (QA) -To download QA datasets, you should register in [BioASQ website](http://participants-area.bioasq.org). 
After the registration, download **[`BioASQ Task B`](http://participants-area.bioasq.org/Tasks/A/getData/)** data, and unpack it to some directory `$BIOASQ_DIR`. Finally, download **[`Question Answering`](https://drive.google.com/open?id=1R2aTOdvGlce95OVQLXnJ_pdm3i6UmVOw)**, our pre-processed version of BioASQ-4/5b datasets, and unpack it to `$BIOASQ_DIR`. +To download QA datasets, you should register in [BioASQ website](http://participants-area.bioasq.org). After the registration, download **[`BioASQ Task B`](http://participants-area.bioasq.org/Tasks/A/getData/)** data, and unpack it to some directory `$BIOASQ_DIR`. Finally, download **[`Question Answering`](https://drive.google.com/open?id=19ft5q44W4SuptJgTwR84xZjsHg1jvjSZ)**, our pre-processed version of BioASQ-4/5/6b datasets, and unpack it to `$BIOASQ_DIR`. Please use `BioASQ-*.json` for training and testing the model. This is necessary as the input data format of BioBERT is different from BioASQ dataset format. Also, please be informed that the do_lower_case flag should be set as `--do_lower_case=False`. Following command runs fine-tuining code on QA with default arguments. 
``` diff --git a/biocodes/transform_nbset2bioasqform.py b/biocodes/transform_nbset2bioasqform.py index 386bb13..e76453d 100644 --- a/biocodes/transform_nbset2bioasqform.py +++ b/biocodes/transform_nbset2bioasqform.py @@ -3,13 +3,14 @@ import pandas as pd import os, subprocess import argparse +from collections import OrderedDict +import operator parser = argparse.ArgumentParser(description='Shape the answer') parser.add_argument('--nbest_path', type=str, help='location of nbest_predictions.json') parser.add_argument('--output_path', type=str, help='location of nbest_predictions.json') args = parser.parse_args() - ### Setting basic strings #### Info : This script is only for factoid question @@ -21,24 +22,72 @@ #### Reading Pred File with open(args.nbest_path, "r") as reader: - test=json.load(reader) + test = json.load(reader) + +qidDict = dict() +for multiQid in test: # Supports Multi-qid + assert len(multiQid) == (24+4) # Please use the latest version of QA datasets. All multiQids should have length of 24 + 4 (4 for Sub id) + if not multiQid[:-4] in qidDict: + qidDict[multiQid[:-4]] = [test[multiQid]] + else : + qidDict[multiQid[:-4]].append(test[multiQid]) + + +entryList = [] +entryListWithProb = [] + +for qid in qidDict: + + jsonList = [] + for jsonele in qidDict[qid]: + jsonList += jsonele -entryList=[] -for qid in test: - ansList=[] # plain list - qidDf=pd.DataFrame().from_dict(test[qid]) - ansList=qidDf.sort_values(by='probability', axis=0, ascending=False)['text'][:5].tolist() + qidDf = pd.DataFrame().from_dict(jsonList) - entry={u"type":"factoid", - #u"body":qas, - u"id":qid, # must be 24 char - u"ideal_answer":["Dummy"], - u"exact_answer":[[ans] for ans in ansList if ans != " "], - # I think enough? 
- } + sortedDf = qidDf.sort_values(by='probability', axis=0, ascending=False) + + sortedSumDict = OrderedDict() + sortedSumDictKeyDict = dict() + + + for index in sortedDf.index: + text = sortedDf.ix[index]["text"] + if text == "": + pass + elif len(text) > 100: + pass + elif text.lower() in sortedSumDictKeyDict: + sortedSumDict[sortedSumDictKeyDict[text.lower()]] += sortedDf.ix[index]["probability"] + else: + sortedSumDictKeyDict[text.lower()] = text + sortedSumDict[sortedSumDictKeyDict[text.lower()]] = sortedDf.ix[index]["probability"] + finalSorted = sorted(sortedSumDict.items(), key=operator.itemgetter(1), reverse=True) + + + entry = {u"type":"factoid", + u"id":qid, # must be 24 chars + u"ideal_answer":"Dummy", + u"exact_answer":[[ans[0]] for ans in finalSorted[:5]], + } entryList.append(entry) -finalformat={u'questions':entryList} + + entryWithProb = {u"type":"factoid", + u"id":qid, # must be 24 chars + u"ideal_answer":"Dummy", + u"exact_answer":[ans for ans in finalSorted[:20]], + } + entryListWithProb.append(entryWithProb) +finalformat = {u'questions':entryList} +finalformatWithProb = {u'questions':entryListWithProb} -with open(args.output_path+"/BioASQform_BioASQ-answer.json", "w") as outfile: - json.dump(finalformat,outfile) +if os.path.isdir(args.output_path): + outfilepath = os.path.join(args.output_path, "BioASQform_BioASQ-answer.json") + outWithProbfilepath = os.path.join(args.output_path, "WithProb_BioASQform_BioASQ-answer.json") +else: + outfilepath = args.output_path + outWithProbfilepath = args.output_path+"_WithProb" +with open(outfilepath, "w") as outfile: + json.dump(finalformat, outfile, indent=2) +with open(outWithProbfilepath, "w") as outfile_prob: + json.dump(finalformatWithProb, outfile_prob, indent=2)