-
Notifications
You must be signed in to change notification settings - Fork 270
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
pythainlp.util.is_native_thai moved to pythainlp.morpheme.is_native_thai
- Loading branch information
1 parent
3d324e3
commit 524759a
Showing
6 changed files
with
173 additions
and
131 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
# -*- coding: utf-8 -*- | ||
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project | ||
# SPDX-License-Identifier: Apache-2.0 | ||
""" | ||
Check if a word is a "native Thai word" | ||
Adapted from | ||
https://github.com/wannaphong/open-thai-nlp-document/blob/master/check_thai_word.md | ||
References | ||
- ทีมงานทรูปลูกปัญญา 2015. ลักษณะของคำไทยแท้ http://www.trueplookpanya.com/learning/detail/30589-043067 | ||
- วารุณี บำรุงรส 2010. คำไทยแท้ https://www.gotoknow.org/posts/377619 | ||
""" | ||
import re | ||
|
||
_THANTHAKHAT_CHAR = "\u0e4c"  # Thanthakhat (cancellation of sound)

# Non-native Thai characters.
# A word containing any of these is treated as non-native, unless it is
# listed in _TH_NATIVE_WORDS. Immutable, as it is only used for lookups.
_TH_NON_NATIVE_CHARS = frozenset(
    {
        "ฆ",
        "ณ",
        "ฌ",
        "ฎ",
        "ฏ",
        "ฐ",
        "ฑ",
        "ฒ",
        "ธ",
        "ศ",
        "ษ",
        "ฬ",
        _THANTHAKHAT_CHAR,
    }
)

# Native Thai final consonants
_TH_NATIVE_FINALS = frozenset({"ก", "ด", "บ", "น", "ง", "ม", "ย", "ว"})

# Known native Thai words (exceptions to the character-based rules)
_TH_NATIVE_WORDS = frozenset(
    {
        "ฆ่า",
        "เฆี่ยน",
        "ศึก",
        "ศอก",
        "เศิก",
        "เศร้า",
        "ธ",
        "ณ",
        "ฯพณฯ",
        "ใหญ่",
        "หญ้า",
        "ควาย",
        "ความ",
        "กริ่งเกรง",
        "ผลิ",
    }
)

# Diphthong prefixes (can start native Thai word)
_TH_PREFIX_DIPHTHONG = frozenset({"กะ", "กระ", "ปะ", "ประ"})

# Thai consonant filter
# O ANG (U+0E2D) is omitted, as it can be considered as vowel
_TH_CONSONANTS_PATTERN = re.compile(r"[ก-ฬฮ]", re.U)
|
||
|
||
def is_native_thai(word: str) -> bool:
    """
    Check if a word is a "native Thai word" (Thai: "คำไทยแท้")

    This function is based on a simple heuristic algorithm
    and cannot be entirely reliable.

    :param str word: word
    :return: True if the word looks like a native Thai word, False
        otherwise (including for non-string or blank input)
    :rtype: bool

    :Example:

    English word::

        from pythainlp.morpheme import is_native_thai

        is_native_thai("Avocado")
        # output: False

    Native Thai word::

        is_native_thai("มะม่วง")
        # output: True
        is_native_thai("ตะวัน")
        # output: True

    Non-native Thai word::

        is_native_thai("สามารถ")
        # output: False
        is_native_thai("อิสริยาภรณ์")
        # output: False
    """
    if not isinstance(word, str):
        return False

    # Strip once; blank or whitespace-only input is never a word
    word = word.strip()
    if not word:
        return False

    # Known native Thai words (exceptions) take precedence over
    # the character-based rules below
    if word in _TH_NATIVE_WORDS:
        return True

    # If a word contains any non-native Thai character,
    # it is not a native Thai word
    if not _TH_NON_NATIVE_CHARS.isdisjoint(word):
        return False

    # If it does not contain any Thai consonants -> it cannot be Thai
    consonants = _TH_CONSONANTS_PATTERN.findall(word)
    if not consonants:
        return False

    # If there's only one Thai consonant -> it can be a native Thai
    if len(consonants) == 1:
        return True

    # If a word ends with native final, it can be a native Thai
    if word[-1] in _TH_NATIVE_FINALS:
        return True

    # NOTE: This only matches when the whole word equals a prefix; it does
    # not detect words that merely start with one. Prefix-sensitive
    # tokenization is required in order to be able to check that.
    if word in _TH_PREFIX_DIPHTHONG:
        return True

    return False
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,129 +1,16 @@ | ||
# -*- coding: utf-8 -*- | ||
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project | ||
# SPDX-License-Identifier: Apache-2.0 | ||
""" | ||
Check if a word is a "native Thai word" | ||
Adapted from | ||
https://github.com/wannaphong/open-thai-nlp-document/blob/master/check_thai_word.md | ||
References | ||
- ทีมงานทรูปลูกปัญญา 2015. ลักษณะของคำไทยแท้ http://www.trueplookpanya.com/learning/detail/30589-043067 | ||
- วารุณี บำรุงรส 2010. คำไทยแท้ https://www.gotoknow.org/posts/377619 | ||
""" | ||
import re | ||
|
||
_THANTHAKHAT_CHAR = "\u0e4c"  # Thanthakhat (cancellation of sound)

# Non-native Thai characters.
# A word containing any of these is treated as non-native, unless it is
# listed in _TH_NATIVE_WORDS. Immutable, as it is only used for lookups.
_TH_NON_NATIVE_CHARS = frozenset(
    {
        "ฆ",
        "ณ",
        "ฌ",
        "ฎ",
        "ฏ",
        "ฐ",
        "ฑ",
        "ฒ",
        "ธ",
        "ศ",
        "ษ",
        "ฬ",
        _THANTHAKHAT_CHAR,
    }
)

# Native Thai final consonants
_TH_NATIVE_FINALS = frozenset({"ก", "ด", "บ", "น", "ง", "ม", "ย", "ว"})

# Known native Thai words (exceptions to the character-based rules)
_TH_NATIVE_WORDS = frozenset(
    {
        "ฆ่า",
        "เฆี่ยน",
        "ศึก",
        "ศอก",
        "เศิก",
        "เศร้า",
        "ธ",
        "ณ",
        "ฯพณฯ",
        "ใหญ่",
        "หญ้า",
        "ควาย",
        "ความ",
        "กริ่งเกรง",
        "ผลิ",
    }
)

# Diphthong prefixes (can start native Thai word)
_TH_PREFIX_DIPHTHONG = frozenset({"กะ", "กระ", "ปะ", "ประ"})

# Thai consonant filter
# O ANG (U+0E2D) is omitted, as it can be considered as vowel
_TH_CONSONANTS_PATTERN = re.compile(r"[ก-ฬฮ]", re.U)
|
||
import warnings | ||
|
||
def is_native_thai(word: str) -> bool:
    """
    Check if a word is a "native Thai word" (Thai: "คำไทยแท้")

    Deprecated alias kept for backward compatibility: it emits a
    ``DeprecationWarning`` and forwards the call to
    ``pythainlp.morpheme.is_native_thai``.

    :param str word: word
    :return: the result of ``pythainlp.morpheme.is_native_thai(word)``
    :rtype: bool

    :Example:
    ::

        # Preferred import path
        from pythainlp.morpheme import is_native_thai

        is_native_thai("มะม่วง")
        # output: True
    """
    # stacklevel=2 attributes the warning to the caller's line,
    # not to this shim.
    warnings.warn(
        "pythainlp.util.is_native_thai is renamed to "
        "pythainlp.morpheme.is_native_thai. "
        "This function will be removed in PyThaiNLP 5.1.",
        DeprecationWarning,
        stacklevel=2,
    )

    # Imported at call time, as in the original shim, so importing this
    # module does not itself pull in pythainlp.morpheme.
    from pythainlp.morpheme import is_native_thai as check

    return check(word)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters