Fix coref return type and add fallback #883

Merged 2 commits on Dec 7, 2023
7 changes: 7 additions & 0 deletions docs/api/classify.rst
@@ -0,0 +1,7 @@
.. currentmodule:: pythainlp.classify

pythainlp.classify
==================

.. autoclass:: GzipModel
:members:
7 changes: 0 additions & 7 deletions docs/api/cls.rst

This file was deleted.

File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -7,7 +7,7 @@
"metadata": {},
"outputs": [],
"source": [
"import pythainlp.cls.param_free"
"import pythainlp.classify.param_free"
]
},
{
@@ -37,7 +37,7 @@
"metadata": {},
"outputs": [],
"source": [
"model = pythainlp.cls.param_free.GzipModel(training_data)"
"model = pythainlp.classify.param_free.GzipModel(training_data)"
]
},
{
File renamed without changes.
10 changes: 10 additions & 0 deletions pyproject.toml
@@ -0,0 +1,10 @@
[tool.ruff]
line-length = 79
indent-width = 4
target-version = "py38"

[tool.ruff.format]
quote-style = "double"
indent-style = "space"
skip-magic-trailing-comma = false
line-ending = "auto"
41 changes: 26 additions & 15 deletions pythainlp/chat/core.py
@@ -10,24 +10,26 @@ def __init__(self):
Chat using AI generation
"""
self.history = []

def reset_chat(self):
"""
Reset chat by cleaning history
"""
self.history = []

def load_model(
self,
model_name:str="wangchanglm",
return_dict:bool=True,
load_in_8bit:bool=False,
device:str="cuda",
model_name: str = "wangchanglm",
return_dict: bool = True,
load_in_8bit: bool = False,
device: str = "cuda",
torch_dtype=torch.float16,
offload_folder:str="./",
low_cpu_mem_usage:bool=True
offload_folder: str = "./",
low_cpu_mem_usage: bool = True,
):
"""
Load model

:param str model_name: Model name (Now, we support wangchanglm only)
:param bool return_dict: return_dict
:param bool load_in_8bit: load model in 8bit
@@ -38,6 +40,7 @@ def load_model(
"""
if model_name == "wangchanglm":
from pythainlp.generate.wangchanglm import WangChanGLM

self.model = WangChanGLM()
self.model.load_model(
model_path="pythainlp/wangchanglm-7.5B-sft-en-sharded",
@@ -46,14 +49,15 @@
offload_folder=offload_folder,
device=device,
torch_dtype=torch_dtype,
low_cpu_mem_usage=low_cpu_mem_usage
low_cpu_mem_usage=low_cpu_mem_usage,
)
else:
raise NotImplementedError(f"We don't support {model_name}.")
def chat(self, text:str)->str:

def chat(self, text: str) -> str:
"""
Chatbot

:param str text: text for asking chatbot with.
:return: answer from chatbot.
:rtype: str
@@ -72,11 +76,18 @@ def chat(self, text:str)->str:
print(chatbot.history)
# output: [('สวัสดี', 'ยินดีที่ได้รู้จัก')]
"""
_temp=""
_temp = ""
if self.history:
for h,b in self.history:
_temp+=self.model.PROMPT_DICT['prompt_chatbot'].format_map({"human":h,"bot":b})+self.model.stop_token
_temp+=self.model.PROMPT_DICT['prompt_chatbot'].format_map({"human":text,"bot":""})
for h, b in self.history:
_temp += (
self.model.PROMPT_DICT["prompt_chatbot"].format_map(
{"human": h, "bot": b}
)
+ self.model.stop_token
)
_temp += self.model.PROMPT_DICT["prompt_chatbot"].format_map(
{"human": text, "bot": ""}
)
_bot = self.model.gen_instruct(_temp)
self.history.append((text,_bot))
self.history.append((text, _bot))
return _bot
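
For context on the reformatted chat() body: it folds the stored (human, bot) history into one prompt string before generation. A minimal sketch of that assembly, where PROMPT_DICT and stop_token are illustrative placeholders rather than the real WangChanGLM values:

    # Hedged sketch of the prompt assembly in chat().
    # PROMPT_DICT and stop_token below are made-up placeholders,
    # not the actual WangChanGLM template or stop token.
    PROMPT_DICT = {"prompt_chatbot": "<human>: {human}\n<bot>: {bot}"}
    stop_token = "</s>"

    history = [("สวัสดี", "ยินดีที่ได้รู้จัก")]
    text = "วันนี้อากาศเป็นอย่างไร"

    prompt = ""
    for human, bot in history:
        prompt += (
            PROMPT_DICT["prompt_chatbot"].format_map(
                {"human": human, "bot": bot}
            )
            + stop_token
        )
    # Final turn is open-ended: the bot slot is left empty for generation.
    prompt += PROMPT_DICT["prompt_chatbot"].format_map(
        {"human": text, "bot": ""}
    )
    print(prompt)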
10 changes: 10 additions & 0 deletions pythainlp/classify/__init__.py
@@ -0,0 +1,10 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
pythainlp.classify
"""

__all__ = ["GzipModel"]

from pythainlp.classify.param_free import GzipModel
@@ -9,7 +9,9 @@

class GzipModel:
"""
This class is a re-implementation of “Low-Resource” Text Classification: A Parameter-Free Classification Method with Compressors (Jiang et al., Findings 2023)
This class is a re-implementation of
“Low-Resource” Text Classification: A Parameter-Free Classification Method with Compressors
(Jiang et al., Findings 2023)

:param list training_data: list [(text_sample,label)]
"""
@@ -36,7 +38,7 @@ def predict(self, x1: str, k: int = 1) -> str:
:Example:
::

from pythainlp.cls import GzipModel
from pythainlp.classify import GzipModel

training_data = [
("รายละเอียดตามนี้เลยค่าา ^^", "Neutral"),
@@ -63,8 +65,10 @@ def predict(self, x1: str, k: int = 1) -> str:
# normalized compression distance
ncd = (Cx1x2 - min(Cx1, Cx2)) / max(Cx1, Cx2)
disance_from_x1.append(ncd)

sorted_idx = np.argsort(np.array(disance_from_x1))
top_k_class = self.training_data[sorted_idx[:k], 1]
_, counts = np.unique(top_k_class, return_counts=True)
predict_class = top_k_class[counts.argmax()]

return predict_class
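
The predict() shown above ranks every training sample by normalized compression distance (NCD) to the query, then majority-votes over the k nearest. A self-contained sketch of that idea, assuming gzip-compressed byte lengths and a space-joined concatenation as in the paper (GzipModel's internals may differ in detail):

    import gzip

    import numpy as np

    def ncd(x1: str, x2: str) -> float:
        # Normalized compression distance:
        # (C(x1x2) - min(C(x1), C(x2))) / max(C(x1), C(x2))
        c_x1 = len(gzip.compress(x1.encode()))
        c_x2 = len(gzip.compress(x2.encode()))
        c_x1x2 = len(gzip.compress(" ".join([x1, x2]).encode()))
        return (c_x1x2 - min(c_x1, c_x2)) / max(c_x1, c_x2)

    training = np.array(
        [
            ("ดีนะครับ", "Positive"),
            ("ขับรถแย่มาก", "Negative"),
            ("นี่เป็นบทความหนึ่ง", "Neutral"),
        ]
    )
    query, k = "ฉันดีใจ", 1

    distances = np.array([ncd(query, text) for text, _ in training])
    top_k_class = training[np.argsort(distances)[:k], 1]
    labels, counts = np.unique(top_k_class, return_counts=True)
    print(labels[counts.argmax()])  # majority label among the k nearest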
8 changes: 7 additions & 1 deletion pythainlp/cls/__init__.py
@@ -3,8 +3,14 @@
# SPDX-License-Identifier: Apache-2.0
"""
pythainlp.cls
Deprecated. Use pythainlp.classify instead.
"""
import warnings

__all__ = ["GzipModel"]

from pythainlp.cls.param_free import GzipModel
from pythainlp.classify.param_free import GzipModel

warnings.warn(
"Deprecated: Use pythainlp.classify instead.", DeprecationWarning
)
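
The shim keeps the old import path alive while pointing users at the new one. A quick way to observe the warning (assuming pythainlp.cls has not yet been imported in the session, since the module-level warning fires only on first import):

    import warnings

    with warnings.catch_warnings(record=True) as caught:
        # DeprecationWarning is hidden by default; surface it.
        warnings.simplefilter("always")
        from pythainlp.cls import GzipModel  # old path, still functional

    print([str(w.message) for w in caught])
    # expected to include: "Deprecated: Use pythainlp.classify instead."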
21 changes: 14 additions & 7 deletions pythainlp/coref/_fastcoref.py
@@ -6,7 +6,13 @@


class FastCoref:
def __init__(self, model_name, nlp=spacy.blank("th"), device: str="cpu", type: str="FCoref") -> None:
def __init__(
self,
model_name,
nlp=spacy.blank("th"),
device: str = "cpu",
type: str = "FCoref",
) -> None:
if type == "FCoref":
from fastcoref import FCoref as _model
else:
@@ -17,11 +23,12 @@ def __init__(self, model_name, nlp=spacy.blank("th"), device: str="cpu", type: s

def _to_json(self, _predict):
return {
"text":_predict.text,
"clusters_string":_predict.get_clusters(as_strings=True),
"clusters":_predict.get_clusters(as_strings=False)
"text": _predict.text,
"clusters_string": _predict.get_clusters(as_strings=True),
"clusters": _predict.get_clusters(as_strings=False),
}


def predict(self, texts: List[str]) -> dict:
return [self._to_json(i) for i in self.model.predict(texts=texts)]
def predict(self, texts: List[str]) -> List[dict]:
return [
self._to_json(pred) for pred in self.model.predict(texts=texts)
]
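
With the annotation corrected, predict() is declared to return what it already produced: one dict per input text. A sketch of that shape, mirroring the keys assembled in _to_json() above (the values here are illustrative, not real model output):

    from typing import List

    # Illustrative output for one input text; each span is a
    # (start, end) character offset into "text".
    preds: List[dict] = [
        {
            "text": "Bill Gates ... ผมรู้สึกสบายมาก",
            "clusters_string": [["Bill Gates", "ผม"]],
            "clusters": [[(0, 10), (50, 52)]],
        }
    ]
    for pred in preds:
        for spans, mentions in zip(
            pred["clusters"], pred["clusters_string"]
        ):
            print(mentions, spans)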
27 changes: 20 additions & 7 deletions pythainlp/coref/core.py
@@ -2,21 +2,26 @@
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
from typing import List

model = None


def coreference_resolution(texts:List[str], model_name:str="han-coref-v1.0", device:str="cpu"):
def coreference_resolution(
texts: List[str], model_name: str = "han-coref-v1.0", device: str = "cpu"
):
"""
Coreference Resolution

:param List[str] texts: list of texts to apply coreference resolution to
:param str model_name: coreference resolution model
:param str device: device for running coreference resolution model on (cpu, cuda, and others)
:param str device: device for running coreference resolution model on\
("cpu", "cuda", and others)
:return: List of texts with coreference resolution
:rtype: List[dict]

:Options for model_name:
* *han-coref-v1.0* - (default) Han-Corf: Thai oreference resolution by PyThaiNLP v1.0
* *han-coref-v1.0* - (default) Han-Coref: Thai coreference resolution\
by PyThaiNLP v1.0

:Example:
::
@@ -30,15 +35,23 @@ def coreference_resolution(texts:List[str], model_name:str="han-coref-v1.0", dev
)
# output:
# [
# {'text': 'Bill Gates ได้รับวัคซีน COVID-19 เข็มแรกแล้ว ระบุ ผมรู้สึกสบายมาก',
# 'clusters_string': [['Bill Gates', 'ผม']],
# {'text': 'Bill Gates ได้รับวัคซีน COVID-19 เข็มแรกแล้ว ระบุ ผมรู้สึกสบายมาก',
# 'clusters_string': [['Bill Gates', 'ผม']],
# 'clusters': [[(0, 10), (50, 52)]]}
# ]
"""
global model
if isinstance(texts, str):
texts = [texts]
if model is None and model_name=="han-coref-v1.0":

if model is None and model_name == "han-coref-v1.0":
from pythainlp.coref.han_coref import HanCoref

model = HanCoref(device=device)
return model.predict(texts)

if model:
return model.predict(texts)

return [
{"text": text, "clusters_string": [], "clusters": []} for text in texts
]
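
The added fallback guarantees a well-formed result even when no model is loaded: each input text comes back with empty clusters instead of an exception. A usage sketch, assuming a fresh process where the module-level model is still None:

    from pythainlp.coref import coreference_resolution

    # An unrecognized model_name leaves `model` as None, so the new
    # fallback returns empty clusters for each input text.
    out = coreference_resolution(
        ["Bill Gates ได้รับวัคซีนแล้ว"], model_name="no-such-model"
    )
    print(out)
    # [{'text': 'Bill Gates ได้รับวัคซีนแล้ว',
    #   'clusters_string': [], 'clusters': []}]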
6 changes: 2 additions & 4 deletions pythainlp/coref/han_coref.py
@@ -6,9 +6,7 @@


class HanCoref(FastCoref):
def __init__(self,device:str="cpu",nlp=spacy.blank("th")) -> None:
def __init__(self, device: str = "cpu", nlp=spacy.blank("th")) -> None:
super().__init__(
model_name="pythainlp/han-coref-v1.0",
device=device,
nlp=nlp
model_name="pythainlp/han-coref-v1.0", device=device, nlp=nlp
)
12 changes: 6 additions & 6 deletions pythainlp/tag/__init__.py
@@ -10,17 +10,17 @@

__all__ = [
"PerceptronTagger",
"NER",
"NNER",
"chunk_parse",
"pos_tag",
"pos_tag_sents",
"pos_tag_transformers",
"tag_provinces",
"chunk_parse",
"NER",
"NNER",
"pos_tag_transformers"
]

from pythainlp.tag.locations import tag_provinces
from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents, pos_tag_transformers
from pythainlp.tag._tag_perceptron import PerceptronTagger
from pythainlp.tag.chunk import chunk_parse
from pythainlp.tag.locations import tag_provinces
from pythainlp.tag.named_entity import NER, NNER
from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents, pos_tag_transformers
20 changes: 20 additions & 0 deletions tests/test_classify.py
@@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-
import unittest
from pythainlp.classify import GzipModel


class TestClsPackage(unittest.TestCase):
def test_GzipModel(self):
training_data = [
("รายละเอียดตามนี้เลยค่าา ^^", "Neutral"),
("กลัวพวกมึงหาย อดกินบาบิก้อน", "Neutral"),
("บริการแย่มากก เป็นหมอได้ไง😤", "Negative"),
("ขับรถแย่มาก", "Negative"),
("ดีนะครับ", "Positive"),
("ลองแล้วรสนี้อร่อย... ชอบๆ", "Positive"),
("ฉันรู้สึกโกรธ เวลามือถือแบตหมด", "Negative"),
("เธอภูมิใจที่ได้ทำสิ่งดี ๆ และดีใจกับเด็ก ๆ", "Positive"),
("นี่เป็นบทความหนึ่ง", "Neutral"),
]
model = GzipModel(training_data)
self.assertEqual(model.predict("ฉันดีใจ", k=1), "Positive")
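
To exercise just this new module, a plain unittest run from the repository root is enough (assuming the tests directory is discoverable as laid out here):

    import unittest

    # Load and run only tests/test_classify.py.
    suite = unittest.defaultTestLoader.discover(
        "tests", pattern="test_classify.py"
    )
    unittest.TextTestRunner(verbosity=2).run(suite)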
20 changes: 0 additions & 20 deletions tests/test_cls.py

This file was deleted.
