Commit
thai2fit v0.3
Ubuntu committed Nov 20, 2018
1 parent 1da39b2 commit 0a8a60d
Showing 9 changed files with 180 additions and 458 deletions.
1 change: 0 additions & 1 deletion pythainlp/__init__.py
@@ -27,7 +27,6 @@
 from pythainlp.collation import collate
 from pythainlp.date import now
 from pythainlp.transliterate import romanize, transliterate
-from pythainlp.sentiment import sentiment
 from pythainlp.soundex import soundex
 from pythainlp.spell import spell
 from pythainlp.tag import pos_tag
51 changes: 0 additions & 51 deletions pythainlp/sentiment/__init__.py

This file was deleted.

Binary file removed pythainlp/sentiment/sentiment.data
71 changes: 0 additions & 71 deletions pythainlp/sentiment/ulmfit_sent.py

This file was deleted.

Binary file removed pythainlp/sentiment/vocabulary.data
97 changes: 97 additions & 0 deletions pythainlp/ulmfit/__init__.py
@@ -1 +1,98 @@
# -*- coding: utf-8 -*-

"""
Code by https://github.com/cstorm125/thai2fit/
"""
import re
import numpy as np
import dill as pickle

#fastai
from fastai import *
from fastai.text.transform import *

#pytorch
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#pythainlp
from pythainlp.corpus import download, get_file
from pythainlp.tokenize import word_tokenize
from pythainlp.util import normalize as normalize_char_order

MODEL_NAME = "thai2fit_lm"
ITOS_NAME = "thai2fit_itos"

# custom fastai tokenizer
class ThaiTokenizer(BaseTokenizer):
    """
    Wrapper around a frozen newmm tokenizer to make it a fastai `BaseTokenizer`.
    """

    def __init__(self, lang: str = 'th'):
        self.lang = lang

    def tokenizer(self, t: str) -> List[str]:
        """
        Tokenize text with the frozen newmm engine.

        :param str t: text to tokenize
        :return: list of tokens
        """
        return word_tokenize(t, engine='ulmfit')

    def add_special_cases(self, toks: Collection[str]):
        pass
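
# e.g. ThaiTokenizer().tokenizer('แมวกินปลา') might yield ['แมว', 'กิน', 'ปลา']
# (illustrative output; actual tokens depend on the frozen newmm dictionary)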

# special rules for Thai
def replace_rep_after(t: str) -> str:
    "Replace character repetitions in `t`, putting `TK_REP` and the count after the repeated character."
    def _replace_rep(m: Collection[str]) -> str:
        c, cc = m.groups()
        return f' {c} {TK_REP} {len(cc)+1} '
    re_rep = re.compile(r'(\S)(\1{3,})')
    return re_rep.sub(_replace_rep, t)
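
# e.g. replace_rep_after('มากกกกก') -> 'มา ก xxrep 5 '
# (assuming fastai v1, where TK_REP is the 'xxrep' token)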

def rm_useless_newlines(t: str) -> str:
    "Replace runs of two or more newlines in `t` with a single space."
    return re.sub('[\n]{2,}', ' ', t)

def rm_brackets(t: str) -> str:
    "Remove all empty brackets from `t`."
    new_line = re.sub(r'\(\)', '', t)
    new_line = re.sub(r'\{\}', '', new_line)
    new_line = re.sub(r'\[\]', '', new_line)
    return new_line
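
# e.g. rm_brackets('วัด() [ไทย]') -> 'วัด [ไทย]' -- only empty pairs are removed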

# in case we want to add more specific rules for Thai
thai_rules = [fix_html, deal_caps, replace_rep_after, normalize_char_order,
spec_add_spaces, rm_useless_spaces, rm_useless_newlines, rm_brackets]
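# thai_rules is meant to be passed as `rules` to a fastai `Tokenizer`,
# which applies them in order (a usage assumption, not shown in this commit)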

# Download pretrained models
def get_path(fname):
    """
    Download (if needed) a file from the pythainlp corpus and return its path.

    :param str fname: file name
    :return: path to the downloaded file
    """
    path = get_file(fname)
    if not path:
        download(fname)
        path = get_file(fname)
    return path

# pretrained paths ([:-4] strips the 4-character file extension)
THWIKI = [get_path(MODEL_NAME)[:-4], get_path(ITOS_NAME)[:-4]]
tt = ThaiTokenizer()

def document_vector(ss, learn, data):
    """
    Get a document vector from a pretrained ULMFiT language model.

    :param str ss: sentence to extract embeddings from
    :param learn: fastai language model learner
    :param data: fastai data bunch
    :return: `numpy.array` of document vector sized 400
    """
    s = tt.tokenizer(ss)
    t = torch.tensor(data.vocab.numericalize(s), requires_grad=False)[:, None].to(device)
    m = learn.model[0]  # the RNN encoder
    m.reset()  # clear hidden state between documents
    pred, _ = m(t)
    # hidden state of the last layer at the final timestep; move to CPU
    # before converting to numpy in case the model runs on GPU
    res = pred[-1][-1, :, :].squeeze().detach().cpu().numpy()
    return res
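
A minimal usage sketch (not part of the commit): `learn` and `data` are assumed to be a fastai v1 language-model learner and DataBunch built from the thai2fit_lm weights and the thai2fit_itos vocab.

    from pythainlp.ulmfit import document_vector

    # `learn`/`data` setup is hypothetical -- any fastai v1 LM learner whose
    # vocab matches thai2fit_itos will do
    vec = document_vector('วันนี้อากาศดีมาก', learn, data)
    print(vec.shape)  # (400,) -- final hidden state of the encoder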