From 7ed3d7a83823c95ed35c59f82ea2669f04e0e5d7 Mon Sep 17 00:00:00 2001 From: WonJin Yoon Date: Tue, 14 May 2019 19:52:45 +0900 Subject: [PATCH] QA dataset replaced and transform_nbset2bioasqform changed. --- README.md | 4 +- biocodes/transform_nbset2bioasqform.py | 83 ++++++++++++++++++++------ 2 files changed, 68 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 88e7700..8ac157d 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ All the fine-tuning experiments were conducted on a single TITAN Xp GPU machine We provide pre-processed version of benchmark datasets for each task as follows: * **[`Named Entity Recognition`](https://drive.google.com/open?id=1OletxmPYNkz2ltOr9pyT0b0iBtUWxslh)**: (17.3 MB), 8 datasets on biomedical named entity recognition * **[`Relation Extraction`](https://drive.google.com/open?id=1-jDKGcXREb2X9xTFnuiJ36PvsqoyHWcw)**: (2.5 MB), 2 datasets on biomedical relation extraction -* **[`Question Answering`](https://drive.google.com/open?id=1R2aTOdvGlce95OVQLXnJ_pdm3i6UmVOw)**: (1.10 MB), 2 datasets on biomedical question answering task. +* **[`Question Answering`](https://drive.google.com/open?id=19ft5q44W4SuptJgTwR84xZjsHg1jvjSZ)**: (5.23 MB), 3 datasets on biomedical question answering task. For details on NER datasets, please see **A Neural Network Multi-Task Learning Approach to Biomedical Named Entity Recognition (Crichton et al. 2017)**. The source of pre-processed datasets are from https://github.com/cambridgeltl/MTL-Bioinformatics-2016 and https://github.com/spyysalo/s800. @@ -118,7 +118,7 @@ precision : 75.87% Please be aware that you have to move `output_dir` to make new model. As some RE datasets are 10-fold divided, you have to make different output directories to train a model with different datasets. ### Question Answering (QA) -To download QA datasets, you should register in [BioASQ website](http://participants-area.bioasq.org). 
After the registration, download **[`BioASQ Task B`](http://participants-area.bioasq.org/Tasks/A/getData/)** data, and unpack it to some directory `$BIOASQ_DIR`. Finally, download **[`Question Answering`](https://drive.google.com/open?id=1R2aTOdvGlce95OVQLXnJ_pdm3i6UmVOw)**, our pre-processed version of BioASQ-4/5b datasets, and unpack it to `$BIOASQ_DIR`. +To download QA datasets, you should register in [BioASQ website](http://participants-area.bioasq.org). After the registration, download **[`BioASQ Task B`](http://participants-area.bioasq.org/Tasks/A/getData/)** data, and unpack it to some directory `$BIOASQ_DIR`. Finally, download **[`Question Answering`](https://drive.google.com/open?id=19ft5q44W4SuptJgTwR84xZjsHg1jvjSZ)**, our pre-processed version of BioASQ-4/5/6b datasets, and unpack it to `$BIOASQ_DIR`. Please use `BioASQ-*.json` for training and testing the model. This is necessary as the input data format of BioBERT is different from BioASQ dataset format. Also, please be informed that the do_lower_case flag should be set as `--do_lower_case=False`. Following command runs fine-tuining code on QA with default arguments. 
``` diff --git a/biocodes/transform_nbset2bioasqform.py b/biocodes/transform_nbset2bioasqform.py index 386bb13..e76453d 100644 --- a/biocodes/transform_nbset2bioasqform.py +++ b/biocodes/transform_nbset2bioasqform.py @@ -3,13 +3,14 @@ import pandas as pd import os, subprocess import argparse +from collections import OrderedDict +import operator parser = argparse.ArgumentParser(description='Shape the answer') parser.add_argument('--nbest_path', type=str, help='location of nbest_predictions.json') parser.add_argument('--output_path', type=str, help='location of nbest_predictions.json') args = parser.parse_args() - ### Setting basic strings #### Info : This script is only for factoid question @@ -21,24 +22,72 @@ #### Reading Pred File with open(args.nbest_path, "r") as reader: - test=json.load(reader) + test = json.load(reader) + +qidDict = dict() +for multiQid in test: # Supports Multi-qid + assert len(multiQid) == (24+4) # Please use the latest version of QA datasets. All multiQids should have length of 24 + 4 (4 for Sub id) + if not multiQid[:-4] in qidDict: + qidDict[multiQid[:-4]] = [test[multiQid]] + else : + qidDict[multiQid[:-4]].append(test[multiQid]) + + +entryList = [] +entryListWithProb = [] + +for qid in qidDict: + + jsonList = [] + for jsonele in qidDict[qid]: + jsonList += jsonele -entryList=[] -for qid in test: - ansList=[] # plain list - qidDf=pd.DataFrame().from_dict(test[qid]) - ansList=qidDf.sort_values(by='probability', axis=0, ascending=False)['text'][:5].tolist() + qidDf = pd.DataFrame().from_dict(jsonList) - entry={u"type":"factoid", - #u"body":qas, - u"id":qid, # must be 24 char - u"ideal_answer":["Dummy"], - u"exact_answer":[[ans] for ans in ansList if ans != " "], - # I think enough? 
- } + sortedDf = qidDf.sort_values(by='probability', axis=0, ascending=False) + + sortedSumDict = OrderedDict() + sortedSumDictKeyDict = dict() + + + for index in sortedDf.index: + text = sortedDf.ix[index]["text"] + if text == "": + pass + elif len(text) > 100: + pass + elif text.lower() in sortedSumDictKeyDict: + sortedSumDict[sortedSumDictKeyDict[text.lower()]] += sortedDf.ix[index]["probability"] + else: + sortedSumDictKeyDict[text.lower()] = text + sortedSumDict[sortedSumDictKeyDict[text.lower()]] = sortedDf.ix[index]["probability"] + finalSorted = sorted(sortedSumDict.items(), key=operator.itemgetter(1), reverse=True) + + + entry = {u"type":"factoid", + u"id":qid, # must be 24 chars + u"ideal_answer":"Dummy", + u"exact_answer":[[ans[0]] for ans in finalSorted[:5]], + } entryList.append(entry) -finalformat={u'questions':entryList} + + entryWithProb = {u"type":"factoid", + u"id":qid, # must be 24 chars + u"ideal_answer":"Dummy", + u"exact_answer":[ans for ans in finalSorted[:20]], + } + entryListWithProb.append(entryWithProb) +finalformat = {u'questions':entryList} +finalformatWithProb = {u'questions':entryListWithProb} -with open(args.output_path+"/BioASQform_BioASQ-answer.json", "w") as outfile: - json.dump(finalformat,outfile) +if os.path.isdir(args.output_path): + outfilepath = os.path.join(args.output_path, "BioASQform_BioASQ-answer.json") + outWithProbfilepath = os.path.join(args.output_path, "WithProb_BioASQform_BioASQ-answer.json") +else: + outfilepath = args.output_path + outWithProbfilepath = args.output_path+"_WithProb" +with open(outfilepath, "w") as outfile: + json.dump(finalformat, outfile, indent=2) +with open(outWithProbfilepath, "w") as outfile_prob: + json.dump(finalformatWithProb, outfile_prob, indent=2)