jiesutd · kaushikacharya · Jun 30, 2022
diff --git a/utils/data.py b/utils/data.py
@@ -5,6 +5,8 @@
 # @Last Modified time: 2019-01-25 20:25:59
 from __future__ import print_function
 from __future__ import absolute_import
+import io
+import os
 import sys
 from .alphabet import Alphabet
 from .functions import *
@@ -212,15 +214,14 @@ def initial_feature_alphabets(self):
 
 
     def build_alphabet(self, input_file):
-        in_lines = open(input_file,'r').readlines()
+        in_lines = io.open(input_file, mode='r', encoding='utf-8').readlines()
         for line in in_lines:
+            line = line.strip()
             if len(line) > 2:
                 ## if sentence classification data format, splited by \t
                 if self.sentence_classification:
-                    pairs = line.strip().split(self.split_token)
+                    pairs = line.split(self.split_token)
                     sent = pairs[0]
-                    if sys.version_info[0] < 3:
-                        sent = sent.decode('utf-8')
                     words = sent.split()
                     for word in words:
                         if self.number_normalized:
@@ -237,10 +238,8 @@ def build_alphabet(self, input_file):
 
                 ## if sequence labeling data format i.e. CoNLL 2003
                 else:
-                    pairs = line.strip().split()
+                    pairs = line.split()
                     word = pairs[0]
-                    if sys.version_info[0] < 3:
-                        word = word.decode('utf-8')
                     if self.number_normalized:
                         word = normalize_word(word)
                     label = pairs[-1]
@@ -313,7 +312,7 @@ def write_decoded_results(self, predict_results, name):
         sent_num = len(predict_results)
         content_list = []
         if name == 'raw':
-           content_list = self.raw_texts
+            content_list = self.raw_texts
         elif name == 'test':
             content_list = self.test_texts
         elif name == 'dev':
@@ -344,6 +343,9 @@ def load(self,data_file):
         self.__dict__.update(tmp_dict)
 
     def save(self,save_file):
+        save_dir = os.path.dirname(save_file)  # '' when save_file has no directory component
+        if save_dir and not os.path.exists(save_dir):  # skip makedirs for bare filenames: os.makedirs('') raises
+            os.makedirs(save_dir)
         f = open(save_file, 'wb')
         pickle.dump(self.__dict__, f, 2)
         f.close()
@@ -353,7 +355,9 @@ def save(self,save_file):
     def write_nbest_decoded_results(self, predict_results, pred_scores, name):
         ## predict_results : [whole_sent_num, nbest, each_sent_length]
         ## pred_scores: [whole_sent_num, nbest]
-        fout = open(self.decode_dir,'w')
+        if not os.path.exists(os.path.dirname(self.decode_dir)):
+            os.makedirs(os.path.dirname(self.decode_dir))
+        fout = open(self.decode_dir, 'w', encoding="utf-8")
         sent_num = len(predict_results)
         content_list = []
         if name == 'raw':

diff --git a/utils/functions.py b/utils/functions.py
@@ -5,6 +5,7 @@
 # @Last Modified time: 2019-02-14 12:23:52
 from __future__ import print_function
 from __future__ import absolute_import
+import io
 import sys
 import numpy as np
 
@@ -20,7 +21,7 @@ def normalize_word(word):
 
 def read_instance(input_file, word_alphabet, char_alphabet, feature_alphabets, label_alphabet, number_normalized, max_sent_length, sentence_classification=False, split_token='\t', char_padding_size=-1, char_padding_symbol = '</pad>'):
     feature_num = len(feature_alphabets)
-    in_lines = open(input_file,'r', encoding="utf8").readlines()
+    in_lines = io.open(input_file, 'r', encoding="utf8").readlines()
     instence_texts = []
     instence_Ids = []
     words = []
@@ -35,11 +36,10 @@ def read_instance(input_file, word_alphabet, char_alphabet, feature_alphabets, l
     ## if sentence classification data format, splited by \t
     if sentence_classification:
         for line in in_lines:
+            line = line.strip()
             if len(line) > 2:
-                pairs = line.strip().split(split_token)
+                pairs = line.split(split_token)
                 sent = pairs[0]
-                if sys.version_info[0] < 3:
-                    sent = sent.decode('utf-8')
                 original_words = sent.split()
                 for word in original_words:
                     words.append(word)
@@ -95,11 +95,10 @@ def read_instance(input_file, word_alphabet, char_alphabet, feature_alphabets, l
     else:
     ### for sequence labeling data format i.e. CoNLL 2003
         for line in in_lines:
+            line = line.strip()
             if len(line) > 2:
-                pairs = line.strip().split()
+                pairs = line.split()
                 word = pairs[0]
-                if sys.version_info[0] < 3:
-                    word = word.decode('utf-8')
                 words.append(word)
                 if number_normalized:
                     word = normalize_word(word)
@@ -196,7 +195,7 @@ def norm2one(vec):
 def load_pretrain_emb(embedding_path):
     embedd_dim = -1
     embedd_dict = dict()
-    with open(embedding_path, 'r', encoding="utf8") as file:
+    with io.open(embedding_path, 'r', encoding="utf8") as file:
         for line in file:
             line = line.strip()
             if len(line) == 0:
@@ -210,10 +209,7 @@ def load_pretrain_emb(embedding_path):
                 # assert (embedd_dim + 1 == len(tokens))
             embedd = np.empty([1, embedd_dim])
             embedd[:] = tokens[1:]
-            if sys.version_info[0] < 3:
-                first_col = tokens[0].decode('utf-8')
-            else:
-                first_col = tokens[0]
+            first_col = tokens[0]
             embedd_dict[first_col] = embedd
     return embedd_dict, embedd_dim