Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Minor bug fixes + add test cases + update readme #147

Merged
merged 10 commits into from
Nov 4, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README-pypi.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ PyThaiNLP features include Thai word and subword segmentations, soundex, romaniz
- thai2vec v0.2 - larger vocab, benchmarking results on Wongnai dataset
- Sentiment classifier based on ULMFit and various product review datasets
- Add ULMFit utility to PyThaiNLP
- Add Thai romanization model thai2rom
- Add Thai romanization model ThaiTransliterator
- Retrain POS-tagging model
- Improved word_tokenize (newmm, mm) and dict_word_tokenize
- Documentation added
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

Thai Natural Language Processing in Python.

PyThaiNLP is a Python package for text processing and linguistic analysis, similar to `nltk`, but with focus on Thai language.
PyThaiNLP is a Python package for text processing and linguistic analysis, similar to `nltk` but with focus on Thai language.

PyThaiNLP supports Python 3.4+. Since version 1.7, PyThaiNLP deprecates its support for Python 2. Python 2 users can still use PyThaiNLP 1.6.

Expand Down Expand Up @@ -44,7 +44,7 @@ Development release
$ pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip
```

Note: PyTorch is required for ulmfit sentiment analyser. ```pip install torch``` is needed for the feature.
Note: PyTorch is required for ulmfit sentiment analyser. ```pip install torch``` is needed for the feature. gensim and keras packages may also be needed for other modules that rely on these machine learning libraries.

## Documentation

Expand Down Expand Up @@ -103,7 +103,7 @@ $ pip install pythainlp
$ pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip
```

หมายเหตุ: เนื่องจาก ulmfit sentiment analyser ต้องใช้ PyTorch จึงต้อง ```pip install torch``` เพื่อติดตั้ง PyTorch ก่อน
หมายเหตุ: เนื่องจาก ulmfit sentiment analyser ต้องใช้ PyTorch จึงต้อง ```pip install torch``` เพื่อติดตั้ง PyTorch ก่อน มอดูลที่อาศัยการเรียนรู้ของเครื่องอื่นๆ อาจจำเป็นต้องติดตั้ง gensim และ keras ก่อนเช่นกัน

## เอกสารการใช้งาน

Expand Down
2 changes: 1 addition & 1 deletion pythainlp/corpus/ttc.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def word_freqs():
ดึงข้อมูลความถี่คำของ Thai Textbook Corpus (TTC) มาใช้งาน
โดยมีรูปแบบข้อมูลเป็น List[Tuple] [(word, frequency), ...]
"""
path = get_full_data_path("tnc_freq.txt") # try local copy first
path = get_full_data_path("ttc_freq.txt") # try local copy first
if not os.path.exists(path): # if fail, download from internet
response = requests.get(_TCC_FREQ_URL)
with open(path, "wb") as f:
Expand Down
113 changes: 64 additions & 49 deletions pythainlp/number/thainum.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,30 +90,38 @@ def bahttext(amount_number):
"""
Converts a number to Thai text and adds a suffix of "Baht" currency.

Similar to BAHTTEXT funcation in Excel
Similar to BAHTTEXT function in Excel
"""
amount_number = number_format(amount_number, 2).replace(" ", "")
pt = amount_number.find(".")
number, fraction = "", ""
amount_number1 = amount_number.split(".")
ret = ""

if not pt:
number = amount_number
if amount_number is None:
pass
elif amount_number == 0:
ret = "ศูนย์บาทถ้วน"
else:
amount_number = amount_number.split(".")
number = amount_number[0]
fraction = int(amount_number1[1])
amount_number = number_format(amount_number, 2).replace(" ", "")
pt = amount_number.find(".")
number, fraction = "", ""
amount_number1 = amount_number.split(".")

ret = ""
number = ast.literal_eval(number.replace(",", ""))
baht = num_to_thaiword(number)
if baht != "":
ret = "".join([ret, baht, "บาท"])
satang = num_to_thaiword(fraction)
if satang != "":
ret = "".join([ret, satang, "สตางค์"])
else:
ret = "".join([ret, "ถ้วน"])
if not pt:
number = amount_number
else:
amount_number = amount_number.split(".")
number = amount_number[0]
fraction = int(amount_number1[1])

number = ast.literal_eval(number.replace(",", ""))

baht = num_to_thaiword(number)
if baht != "":
ret = "".join([ret, baht, "บาท"])

satang = num_to_thaiword(fraction)
if satang != "" and satang != "ศูนย์":
ret = "".join([ret, satang, "สตางค์"])
else:
ret = "".join([ret, "ถ้วน"])

return ret

Expand All @@ -123,38 +131,45 @@ def num_to_thaiword(number):
:param float number: a float number (with decimals) indicating a quantity
:return: a text that indicates the full amount in word form, properly ending each digit with the right term.
"""
position_call = ["แสน", "หมื่น", "พัน", "ร้อย", "สิบ", ""]
number_call = ["", "หนึ่ง", "สอง", "สาม", "สี่", "ห้า", "หก", "เจ็ด", "แปด", "เก้า"]

ret = ""
if number == 0:
return ret
if number > 1000000:
ret += num_to_thaiword(int(number / 1000000)) + "ล้าน"
number = int(math.fmod(number, 1000000))
divider = 100000

pos = 0
while number > 0:
d = int(number / divider)
if (divider == 10) and (d == 2):
ret += "ยี่"
elif (divider == 10) and (d == 1):
ret += ""
elif (divider == 1) and (d == 1) and (ret != ""):
ret += "เอ็ด"
else:
ret += number_call[d]
if d:
ret += position_call[pos]
else:
ret += ""
number = number % divider
divider = divider / 10
pos += 1

if number is None:
pass
elif number == 0:
ret = "ศูนย์"
else:
_POS_CALL = ["แสน", "หมื่น", "พัน", "ร้อย", "สิบ", ""]
_NUM_CALL = ["", "หนึ่ง", "สอง", "สาม", "สี่", "ห้า", "หก", "เจ็ด", "แปด", "เก้า"]

if number > 1000000:
ret += num_to_thaiword(int(number / 1000000)) + "ล้าน"
number = int(math.fmod(number, 1000000))
divider = 100000

pos = 0
while number > 0:
d = int(number / divider)

if (divider == 10) and (d == 2):
ret += "ยี่"
elif (divider == 10) and (d == 1):
ret += ""
elif (divider == 1) and (d == 1) and (ret != ""):
ret += "เอ็ด"
else:
ret += _NUM_CALL[d]

if d:
ret += _POS_CALL[pos]
else:
ret += ""

number = number % divider
divider = divider / 10
pos += 1

return ret


if __name__ == "__main__":
print(bahtext(4000.0))
print(bahttext(4000.0))
19 changes: 17 additions & 2 deletions pythainlp/number/wordtonum.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
"""
import re

from pythainlp.tokenize import Tokenizer

_THAIWORD_NUMS = set("ศูนย์ หนึ่ง เอ็ด สอง ยี่ สาม สี่ ห้า หก เจ็ด แปด เก้า".split())
_THAIWORD_UNITS = set("สิบ ร้อย พัน หมื่น แสน ล้าน".split())
_THAIWORD_NUMS_UNITS = _THAIWORD_NUMS | _THAIWORD_UNITS
Expand Down Expand Up @@ -34,12 +36,14 @@
_NU_PAT = re.compile("(.+)?(สิบ|ร้อย|พัน|หมื่น|แสน|ล้าน)(.+)?") # หกสิบ, ร้อยเอ็ด
# assuming that the units are separated already

_TOKENIZER = Tokenizer(custom_dict=_THAIWORD_NUMS_UNITS)


def _thaiword_to_num(tokens):
len_tokens = len(tokens)

if len_tokens == 0:
return 0
return None

if len_tokens == 1:
return _THAI_INT_MAP[tokens[0]]
Expand All @@ -61,7 +65,17 @@ def _thaiword_to_num(tokens):
return _THAI_INT_MAP[a] * _THAI_INT_MAP[b] + _thaiword_to_num(tokens[2:])


def thaiword_to_num(tokens):
def thaiword_to_num(thaiword):
if not thaiword:
return None

tokens = []
if type(thaiword) == str:
tokens = _TOKENIZER.word_tokenize(thaiword)
elif type(thaiword) in (list, tuple, set, frozenset):
for w in thaiword:
tokens.extend(_TOKENIZER.word_tokenize(w))

res = []
for tok in tokens:
if tok in _THAIWORD_NUMS_UNITS:
Expand All @@ -72,4 +86,5 @@ def thaiword_to_num(tokens):
res.extend([t for t in m.groups() if t]) # ตัด None ทิ้ง
else:
pass # should not be here

return _thaiword_to_num(res)
2 changes: 2 additions & 0 deletions pythainlp/sentiment/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,12 @@ def sentiment(text, engine="old"):
os.path.join(_SENTIMENT_PATH, "vocabulary.data"), "rb"
) as in_strm:
vocabulary = dill.load(in_strm)

with open(
os.path.join(_SENTIMENT_PATH, "sentiment.data"), "rb"
) as in_strm:
classifier = dill.load(in_strm)

text = set(word_tokenize(text)) - _STOPWORDS
featurized_test_sentence = {i: (i in text) for i in vocabulary}

Expand Down
2 changes: 0 additions & 2 deletions pythainlp/ulmfit/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from pythainlp.corpus import download, get_file
from pythainlp.tokenize import word_tokenize


try:
import numpy as np
from fastai.text import *
Expand Down Expand Up @@ -43,7 +42,6 @@ def __init__(self, engine="newmm"):
* newmm - dictionary-based, Maximum Matching algorithm + TCC
* longest - dictionary-based, Longest Matching
* icu - use ICU, dictionary-based
* pylexto - use LexTo, dictionary-based
* deepcut - use deepcut, language model-based
"""
self.engine = engine
Expand Down
18 changes: 16 additions & 2 deletions tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
conceptnet,
countries,
provinces,
remove,
thai_negations,
thai_stopwords,
thai_syllables,
Expand Down Expand Up @@ -66,6 +67,7 @@ def test_corpus(self):
self.assertIsNotNone(thai_stopwords())
self.assertIsNotNone(thai_syllables())
self.assertIsNotNone(thai_words())
self.assertIsNotNone(remove("tnc_freq"))

def test_tnc(self):
self.assertIsNotNone(tnc.word_freqs())
Expand Down Expand Up @@ -150,13 +152,25 @@ def test_number(self):
bahttext(5611116.50),
"ห้าล้านหกแสนหนึ่งหมื่นหนึ่งพันหนึ่งร้อยสิบหกบาทห้าสิบสตางค์",
)
self.assertEqual(bahttext(116), "หนึ่งร้อยสิบหกบาทถ้วน")
self.assertEqual(bahttext(0), "ศูนย์บาทถ้วน")
self.assertEqual(bahttext(None), "")

self.assertEqual(num_to_thaiword(112), "หนึ่งร้อยสิบสอง")
self.assertEqual(num_to_thaiword(0), "ศูนย์")
self.assertEqual(num_to_thaiword(None), "")

self.assertEqual(thaiword_to_num("ร้อยสิบสอง"), 112)
self.assertEqual(
thaiword_to_num(
["หก", "ล้าน", "หกแสน", "หกหมื่น", "หกพัน", "หกร้อย", "หกสิบ", "หก"]
["หก", "ล้าน", "หก", "แสน", "หกหมื่น", "หกพัน", "หกร้อย", "หกสิบ", "หก"]
),
6666666,
)
self.assertEqual(thaiword_to_num("ยี่สิบ"), 20)
self.assertEqual(thaiword_to_num("ศูนย์"), 0)
self.assertEqual(thaiword_to_num(""), None)
self.assertEqual(thaiword_to_num(None), None)

# ### pythainlp.rank

Expand All @@ -181,7 +195,7 @@ def test_romanization_royin(self):

def test_sentiment(self):
text = "เสียใจมาก"
# self.assertEqual(sentiment(text, engine="old"), "neg")
self.assertEqual(sentiment(text, engine="old"), "neg")
# self.assertEqual(sentiment(text, engine="ulmfit"), "neg")

# ### pythainlp.soundex
Expand Down