Commit
thai2fit v0.3
Ubuntu committed Nov 20, 2018
1 parent 1da39b2 commit 0a8a60d
Showing 9 changed files with 180 additions and 458 deletions.
1 change: 0 additions & 1 deletion pythainlp/__init__.py
@@ -27,7 +27,6 @@
 from pythainlp.collation import collate
 from pythainlp.date import now
 from pythainlp.transliterate import romanize, transliterate
-from pythainlp.sentiment import sentiment
 from pythainlp.soundex import soundex
 from pythainlp.spell import spell
 from pythainlp.tag import pos_tag
51 changes: 0 additions & 51 deletions pythainlp/sentiment/__init__.py

This file was deleted.

Binary file removed pythainlp/sentiment/sentiment.data
71 changes: 0 additions & 71 deletions pythainlp/sentiment/ulmfit_sent.py

This file was deleted.

Binary file removed pythainlp/sentiment/vocabulary.data
97 changes: 97 additions & 0 deletions pythainlp/ulmfit/__init__.py
@@ -1 +1,98 @@
# -*- coding: utf-8 -*-

"""
Code by https://github.com/cstorm125/thai2fit/
"""
import re
import numpy as np
import dill as pickle

#fastai
from fastai import *
from fastai.text.transform import *

#pytorch
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#pythainlp
from pythainlp.corpus import download, get_file
from pythainlp.tokenize import word_tokenize
from pythainlp.util import normalize as normalize_char_order

MODEL_NAME = "thai2fit_lm"
ITOS_NAME = "thai2fit_itos"

# custom fastai tokenizer
class ThaiTokenizer(BaseTokenizer):
    """
    Wrapper around a frozen newmm tokenizer to make it a fastai `BaseTokenizer`.
    """

    def __init__(self, lang: str = 'th'):
        self.lang = lang

    def tokenizer(self, t: str) -> List[str]:
        """
        Tokenize text with the frozen newmm engine.

        :param str t: text to tokenize
        :return: list of tokens
        """
        return word_tokenize(t, engine='ulmfit')

    def add_special_cases(self, toks: Collection[str]):
        pass
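
# e.g. ThaiTokenizer().tokenizer('แมวกินปลา') might yield ['แมว', 'กิน', 'ปลา']
# (illustrative output; actual tokens depend on the frozen newmm dictionary)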

# special rules for Thai
def replace_rep_after(t: str) -> str:
    "Replace character repetitions in `t`, putting `TK_REP` and the count after the repeated character."
    def _replace_rep(m: Collection[str]) -> str:
        c, cc = m.groups()
        return f' {c} {TK_REP} {len(cc)+1} '
    re_rep = re.compile(r'(\S)(\1{3,})')
    return re_rep.sub(_replace_rep, t)
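
# e.g. replace_rep_after('มากกกกก') -> 'มา ก xxrep 5 '
# (assuming fastai v1, where TK_REP is the 'xxrep' token)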

def rm_useless_newlines(t: str) -> str:
    "Replace runs of two or more newlines in `t` with a single space."
    return re.sub('[\n]{2,}', ' ', t)

def rm_brackets(t: str) -> str:
    "Remove all empty brackets from `t`."
    new_line = re.sub(r'\(\)', '', t)
    new_line = re.sub(r'\{\}', '', new_line)
    new_line = re.sub(r'\[\]', '', new_line)
    return new_line
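
# e.g. rm_brackets('วัด() [ไทย]') -> 'วัด [ไทย]' -- only empty pairs are removed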

# in case we want to add more specific rules for Thai
thai_rules = [fix_html, deal_caps, replace_rep_after, normalize_char_order,
spec_add_spaces, rm_useless_spaces, rm_useless_newlines, rm_brackets]
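# thai_rules is meant to be passed as `rules` to a fastai `Tokenizer`,
# which applies them in order (a usage assumption, not shown in this commit)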

# Download pretrained models
def get_path(fname):
    """
    Download (if needed) a file from the pythainlp corpus and return its path.

    :param str fname: file name
    :return: path to the downloaded file
    """
    path = get_file(fname)
    if not path:
        download(fname)
        path = get_file(fname)
    return path

# pretrained paths ([:-4] strips the 4-character file extension)
THWIKI = [get_path(MODEL_NAME)[:-4], get_path(ITOS_NAME)[:-4]]
tt = ThaiTokenizer()

def document_vector(ss, learn, data):
    """
    Get a document vector from a pretrained ULMFiT language model.

    :param str ss: sentence to extract embeddings from
    :param learn: fastai language model learner
    :param data: fastai data bunch
    :return: `numpy.array` of document vector sized 400
    """
    s = tt.tokenizer(ss)
    t = torch.tensor(data.vocab.numericalize(s), requires_grad=False)[:, None].to(device)
    m = learn.model[0]  # the RNN encoder
    m.reset()  # clear hidden state between documents
    pred, _ = m(t)
    # hidden state of the last layer at the final timestep; move to CPU
    # before converting to numpy in case the model runs on GPU
    res = pred[-1][-1, :, :].squeeze().detach().cpu().numpy()
    return res
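
A minimal usage sketch (not part of the commit): `learn` and `data` are assumed to be a fastai v1 language-model learner and DataBunch built from the thai2fit_lm weights and the thai2fit_itos vocab.

    from pythainlp.ulmfit import document_vector

    # `learn`/`data` setup is hypothetical -- any fastai v1 LM learner whose
    # vocab matches thai2fit_itos will do
    vec = document_vector('วันนี้อากาศดีมาก', learn, data)
    print(vec.shape)  # (400,) -- final hidden state of the encoder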