Skip to content

Commit

Permalink
pythainlp.util.is_native_thai moved to pythainlp.morpheme.is_native_thai
Browse files Browse the repository at this point in the history
  • Loading branch information
wannaphong committed Jan 5, 2024
1 parent 3d324e3 commit 524759a
Show file tree
Hide file tree
Showing 6 changed files with 173 additions and 131 deletions.
7 changes: 6 additions & 1 deletion docs/api/morpheme.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,9 @@ pythainlp.morpheme

The `pythainlp.morpheme` module collects functions for morpheme analysis, word formation, and more for the Thai language.

.. autofunction:: nighit
.. autofunction:: nighit

.. autofunction:: is_native_thai
:noindex:

The `is_native_thai` function checks whether a word is likely to be a "native Thai word" (คำไทยแท้) rather than a word of non-native origin. It is based on a simple heuristic and cannot be entirely reliable.
5 changes: 0 additions & 5 deletions docs/api/util.rst
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,6 @@ Modules

The `ipa_to_rtgs` function focuses on converting International Phonetic Alphabet (IPA) transcriptions into Royal Thai General System of Transcription (RTGS) format. This is valuable for phonetic analysis and pronunciation guides.

.. autofunction:: is_native_thai
:noindex:

The `is_native_thai` function is a language detection tool that identifies whether text is predominantly in the Thai language or not. It aids in language identification and text categorization tasks.

.. autofunction:: isthai
:noindex:

Expand Down
5 changes: 5 additions & 0 deletions pythainlp/morpheme/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,9 @@
"""
PyThaiNLP morpheme
"""
__all__ = [
"nighit",
"is_native_thai"
]
from pythainlp.morpheme.word_formation import nighit
from pythainlp.morpheme.thaiwordcheck import is_native_thai
129 changes: 129 additions & 0 deletions pythainlp/morpheme/thaiwordcheck.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Check if a word is a "native Thai word"
Adapted from
https://github.com/wannaphong/open-thai-nlp-document/blob/master/check_thai_word.md
References
- ทีมงานทรูปลูกปัญญา 2015. ลักษณะของคำไทยแท้ http://www.trueplookpanya.com/learning/detail/30589-043067
- วารุณี บำรุงรส 2010. คำไทยแท้ https://www.gotoknow.org/posts/377619
"""
import re

_THANTHAKHAT_CHAR = "\u0e4c"  # Thanthakhat (cancellation of sound)

# Characters whose presence marks a word as non-native Thai
_TH_NON_NATIVE_CHARS = {
    "ฆ",
    "ณ",
    "ฌ",
    "ฎ",
    "ฏ",
    "ฐ",
    "ฑ",
    "ฒ",
    "ธ",
    "ศ",
    "ษ",
    "ฬ",
    _THANTHAKHAT_CHAR,
}

# Native Thai final consonants
_TH_NATIVE_FINALS = {"ก", "ด", "บ", "น", "ง", "ม", "ย", "ว"}

# Known native Thai words (exceptions to the character rule above)
_TH_NATIVE_WORDS = {
    "ฆ่า",
    "เฆี่ยน",
    "ศึก",
    "ศอก",
    "เศิก",
    "เศร้า",
    "ธ",
    "ณ",
    "ฯพณฯ",
    "ใหญ่",
    "หญ้า",
    "ควาย",
    "ความ",
    "กริ่งเกรง",
    "ผลิ",
}

# Diphthong prefixes (can start native Thai word)
_TH_PREFIX_DIPHTHONG = {"กะ", "กระ", "ปะ", "ประ"}

# Thai consonant filter
# O ANG (U+0E2D) is omitted, as it can be considered as vowel
_TH_CONSONANTS_PATTERN = re.compile(r"[ก-ฬฮ]", re.U)


def is_native_thai(word: str) -> bool:
    """
    Check if a word is a "native Thai word" (Thai: "คำไทยแท้")

    This function is based on a simple heuristic algorithm
    and cannot be entirely reliable.

    The checks are applied in this order:

    1. a non-string or blank input is not native;
    2. a word listed in the known-exceptions table is native;
    3. a word containing any non-native character is not native;
    4. a word with no Thai consonant is not native;
    5. a word with exactly one Thai consonant is native;
    6. a word whose last character is a native final consonant is native;
    7. a word that is exactly one of the diphthong prefixes is native;
    8. anything else is not native.

    :param str word: word
    :return: True if the word looks like a native Thai word, else False
    :rtype: bool

    :Example:

    English word::

        from pythainlp.morpheme import is_native_thai

        is_native_thai("Avocado")
        # output: False

    Native Thai word::

        is_native_thai("มะม่วง")
        # output: True
        is_native_thai("ตะวัน")
        # output: True

    Non-native Thai word::

        is_native_thai("สามารถ")
        # output: False
        is_native_thai("อิสริยาภรณ์")
        # output: False
    """
    if not isinstance(word, str):
        return False

    # Strip once; a whitespace-only input becomes empty and is rejected.
    word = word.strip()
    if not word:
        return False

    # Known native Thai words (exceptions); must be checked before the
    # non-native character test, since some of them contain such characters.
    if word in _TH_NATIVE_WORDS:
        return True

    # If a word contains any non-native Thai character, it is not native
    if not _TH_NON_NATIVE_CHARS.isdisjoint(word):
        return False

    consonants = _TH_CONSONANTS_PATTERN.findall(word)

    # If it does not contain any Thai consonants -> it cannot be Thai
    if not consonants:
        return False

    # If there's only one Thai consonant -> it can be a native Thai
    if len(consonants) == 1:
        return True

    # If a word ends with native final, it can be a native Thai
    if word[-1] in _TH_NATIVE_FINALS:
        return True

    # NOTE: this compares the whole word against the prefix table, so it only
    # accepts a word that *is* one of the prefixes. Checking real prefixes
    # would require prefix-sensitive tokenization.
    return word in _TH_PREFIX_DIPHTHONG
135 changes: 11 additions & 124 deletions pythainlp/util/thaiwordcheck.py
Original file line number Diff line number Diff line change
@@ -1,129 +1,16 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Check if a word is a "native Thai word"
Adapted from
https://github.com/wannaphong/open-thai-nlp-document/blob/master/check_thai_word.md
References
- ทีมงานทรูปลูกปัญญา 2015. ลักษณะของคำไทยแท้ http://www.trueplookpanya.com/learning/detail/30589-043067
- วารุณี บำรุงรส 2010. คำไทยแท้ https://www.gotoknow.org/posts/377619
"""
import re

# Thanthakhat: the mark that cancels the sound of a letter
_THANTHAKHAT_CHAR = "\u0e4c"

# Characters treated as markers of a non-native Thai word
_TH_NON_NATIVE_CHARS = {
    "ฆ", "ณ", "ฌ", "ฎ",
    "ฏ", "ฐ", "ฑ", "ฒ",
    "ธ", "ศ", "ษ", "ฬ",
    _THANTHAKHAT_CHAR,
}

# Final consonants found in native Thai words
_TH_NATIVE_FINALS = {
    "ก", "ด", "บ", "น",
    "ง", "ม", "ย", "ว",
}

# Words known to be native Thai even though the character rule
# would reject some of them
_TH_NATIVE_WORDS = {
    "ฆ่า", "เฆี่ยน",
    "ศึก", "ศอก", "เศิก", "เศร้า",
    "ธ", "ณ", "ฯพณฯ",
    "ใหญ่", "หญ้า",
    "ควาย", "ความ",
    "กริ่งเกรง", "ผลิ",
}

# Diphthong prefixes that can start a native Thai word
_TH_PREFIX_DIPHTHONG = {
    "กะ", "กระ",
    "ปะ", "ประ",
}

# Matches a single Thai consonant.
# O ANG (U+0E2D) is left out on purpose, as it can be considered a vowel.
_TH_CONSONANTS_PATTERN = re.compile(r"[ก-ฬฮ]", flags=re.UNICODE)

import warnings

def is_native_thai(word: str) -> bool:
    """
    Check if a word is a "native Thai word" (Thai: "คำไทยแท้")

    Deprecated alias: this function only emits a ``DeprecationWarning``
    and then delegates to ``pythainlp.morpheme.is_native_thai``.
    Use ``pythainlp.morpheme.is_native_thai`` directly instead.

    :param str word: word
    :return: whatever ``pythainlp.morpheme.is_native_thai`` returns
             for ``word``
    :rtype: bool
    """
    # stacklevel=2 attributes the warning to the caller's line, so that the
    # code that still uses the old import path is the one reported.
    warnings.warn(
        "pythainlp.util.is_native_thai has been renamed to "
        "pythainlp.morpheme.is_native_thai. "
        "This function will be removed in PyThaiNLP 5.1.",
        DeprecationWarning,
        stacklevel=2,
    )
    # Imported at call time, as in the original code.
    # NOTE(review): presumably this avoids a circular import between
    # pythainlp.util and pythainlp.morpheme - confirm.
    from pythainlp.morpheme import is_native_thai as check

    return check(word)
23 changes: 22 additions & 1 deletion tests/test_morpheme.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

import unittest
from pythainlp.morpheme import nighit
from pythainlp.morpheme import nighit, is_native_thai


class TestMorphemePackage(unittest.TestCase):
Expand All @@ -14,3 +14,24 @@ def test_nighit(self):
self.assertEqual(nighit("สํ", "นิษฐาน"), "สันนิษฐาน")
self.assertEqual(nighit("สํ", "ปทา"), "สัมปทา")
self.assertEqual(nighit("สํ", "โยค"), "สังโยค")

def test_is_native_thai(self):
    """Check is_native_thai against a table of (input, expected) pairs."""
    cases = [
        # Invalid or non-Thai input
        (None, False),
        ("", False),
        ("116", False),
        ("abc", False),
        # Words expected to be classified as native Thai
        ("ตา", True),
        ("ยา", True),
        ("ฆ่า", True),
        ("คน", True),
        ("กะ", True),
        ("มอ", True),
        ("กระ", True),
        ("ประท้วง", True),
        # Words expected to be classified as non-native
        ("ศา", False),
        ("ลักษ์", False),
        ("มาร์ค", False),
        ("เลข", False),
        ("เทเวศน์", False),
        ("เทเวศร์", False),
    ]
    for word, expected in cases:
        # subTest reports every failing word instead of stopping at the first
        with self.subTest(word=word):
            self.assertEqual(is_native_thai(word), expected)

0 comments on commit 524759a

Please sign in to comment.