Fix coref return type and add fallback #883

Merged 2 commits on Dec 7, 2023
7 changes: 7 additions & 0 deletions docs/api/classify.rst
@@ -0,0 +1,7 @@
.. currentmodule:: pythainlp.classify

pythainlp.classify
==================

.. autoclass:: GzipModel
:members:
7 changes: 0 additions & 7 deletions docs/api/cls.rst

This file was deleted.

File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -7,7 +7,7 @@
"metadata": {},
"outputs": [],
"source": [
"import pythainlp.cls.param_free"
"import pythainlp.classify.param_free"
]
},
{
@@ -37,7 +37,7 @@
"metadata": {},
"outputs": [],
"source": [
"model = pythainlp.cls.param_free.GzipModel(training_data)"
"model = pythainlp.classify.param_free.GzipModel(training_data)"
]
},
{
File renamed without changes.
10 changes: 10 additions & 0 deletions pyproject.toml
@@ -0,0 +1,10 @@
[tool.ruff]
line-length = 79
indent-width = 4
target-version = "py38"

[tool.ruff.format]
quote-style = "double"
indent-style = "space"
skip-magic-trailing-comma = false
line-ending = "auto"
41 changes: 26 additions & 15 deletions pythainlp/chat/core.py
@@ -10,24 +10,26 @@ def __init__(self):
Chat using AI generation
"""
self.history = []

def reset_chat(self):
"""
Reset chat by cleaning history
"""
self.history = []

def load_model(
self,
model_name:str="wangchanglm",
return_dict:bool=True,
load_in_8bit:bool=False,
device:str="cuda",
model_name: str = "wangchanglm",
return_dict: bool = True,
load_in_8bit: bool = False,
device: str = "cuda",
torch_dtype=torch.float16,
offload_folder:str="./",
low_cpu_mem_usage:bool=True
offload_folder: str = "./",
low_cpu_mem_usage: bool = True,
):
"""
Load model

:param str model_name: Model name (Now, we support wangchanglm only)
:param bool return_dict: return_dict
:param bool load_in_8bit: load model in 8bit
@@ -38,6 +40,7 @@ def load_model(
"""
if model_name == "wangchanglm":
from pythainlp.generate.wangchanglm import WangChanGLM

self.model = WangChanGLM()
self.model.load_model(
model_path="pythainlp/wangchanglm-7.5B-sft-en-sharded",
@@ -46,14 +49,15 @@
offload_folder=offload_folder,
device=device,
torch_dtype=torch_dtype,
low_cpu_mem_usage=low_cpu_mem_usage
low_cpu_mem_usage=low_cpu_mem_usage,
)
else:
raise NotImplementedError(f"We don't support {model_name}.")
def chat(self, text:str)->str:

def chat(self, text: str) -> str:
"""
Chatbot

:param str text: text for asking chatbot with.
:return: answer from chatbot.
:rtype: str
@@ -72,11 +76,18 @@ def chat(self, text:str)->str:
print(chatbot.history)
# output: [('สวัสดี', 'ยินดีที่ได้รู้จัก')]
"""
_temp=""
_temp = ""
if self.history:
for h,b in self.history:
_temp+=self.model.PROMPT_DICT['prompt_chatbot'].format_map({"human":h,"bot":b})+self.model.stop_token
_temp+=self.model.PROMPT_DICT['prompt_chatbot'].format_map({"human":text,"bot":""})
for h, b in self.history:
_temp += (
self.model.PROMPT_DICT["prompt_chatbot"].format_map(
{"human": h, "bot": b}
)
+ self.model.stop_token
)
_temp += self.model.PROMPT_DICT["prompt_chatbot"].format_map(
{"human": text, "bot": ""}
)
_bot = self.model.gen_instruct(_temp)
self.history.append((text,_bot))
self.history.append((text, _bot))
return _bot
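
For context on the reformatted chat() body: it folds the stored (human, bot) history into one prompt string before generation. A minimal sketch of that assembly, where PROMPT_DICT and stop_token are illustrative placeholders rather than the real WangChanGLM values:

    # Hedged sketch of the prompt assembly in chat().
    # PROMPT_DICT and stop_token below are made-up placeholders,
    # not the actual WangChanGLM template or stop token.
    PROMPT_DICT = {"prompt_chatbot": "<human>: {human}\n<bot>: {bot}"}
    stop_token = "</s>"

    history = [("สวัสดี", "ยินดีที่ได้รู้จัก")]
    text = "วันนี้อากาศเป็นอย่างไร"

    prompt = ""
    for human, bot in history:
        prompt += (
            PROMPT_DICT["prompt_chatbot"].format_map(
                {"human": human, "bot": bot}
            )
            + stop_token
        )
    # Final turn is open-ended: the bot slot is left empty for generation.
    prompt += PROMPT_DICT["prompt_chatbot"].format_map(
        {"human": text, "bot": ""}
    )
    print(prompt)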
10 changes: 10 additions & 0 deletions pythainlp/classify/__init__.py
@@ -0,0 +1,10 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
pythainlp.classify
"""

__all__ = ["GzipModel"]

from pythainlp.classify.param_free import GzipModel
@@ -9,7 +9,9 @@

class GzipModel:
"""
This class is a re-implementation of “Low-Resource” Text Classification: A Parameter-Free Classification Method with Compressors (Jiang et al., Findings 2023)
This class is a re-implementation of
“Low-Resource” Text Classification: A Parameter-Free Classification Method with Compressors
(Jiang et al., Findings 2023)

:param list training_data: list [(text_sample,label)]
"""
@@ -36,7 +38,7 @@ def predict(self, x1: str, k: int = 1) -> str:
:Example:
::

from pythainlp.cls import GzipModel
from pythainlp.classify import GzipModel

training_data = [
("รายละเอียดตามนี้เลยค่าา ^^", "Neutral"),
@@ -63,8 +65,10 @@ def predict(self, x1: str, k: int = 1) -> str:
# normalized compression distance
ncd = (Cx1x2 - min(Cx1, Cx2)) / max(Cx1, Cx2)
disance_from_x1.append(ncd)

sorted_idx = np.argsort(np.array(disance_from_x1))
top_k_class = self.training_data[sorted_idx[:k], 1]
_, counts = np.unique(top_k_class, return_counts=True)
predict_class = top_k_class[counts.argmax()]

return predict_class
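
The predict() shown above ranks every training sample by normalized compression distance (NCD) to the query, then majority-votes over the k nearest. A self-contained sketch of that idea, assuming gzip-compressed byte lengths and a space-joined concatenation as in the paper (GzipModel's internals may differ in detail):

    import gzip

    import numpy as np

    def ncd(x1: str, x2: str) -> float:
        # Normalized compression distance:
        # (C(x1x2) - min(C(x1), C(x2))) / max(C(x1), C(x2))
        c_x1 = len(gzip.compress(x1.encode()))
        c_x2 = len(gzip.compress(x2.encode()))
        c_x1x2 = len(gzip.compress(" ".join([x1, x2]).encode()))
        return (c_x1x2 - min(c_x1, c_x2)) / max(c_x1, c_x2)

    training = np.array(
        [
            ("ดีนะครับ", "Positive"),
            ("ขับรถแย่มาก", "Negative"),
            ("นี่เป็นบทความหนึ่ง", "Neutral"),
        ]
    )
    query, k = "ฉันดีใจ", 1

    distances = np.array([ncd(query, text) for text, _ in training])
    top_k_class = training[np.argsort(distances)[:k], 1]
    labels, counts = np.unique(top_k_class, return_counts=True)
    print(labels[counts.argmax()])  # majority label among the k nearest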
8 changes: 7 additions & 1 deletion pythainlp/cls/__init__.py
@@ -3,8 +3,14 @@
# SPDX-License-Identifier: Apache-2.0
"""
pythainlp.cls
Deprecated. Use pythainlp.classify instead.
"""
import warnings

__all__ = ["GzipModel"]

from pythainlp.cls.param_free import GzipModel
from pythainlp.classify.param_free import GzipModel

warnings.warn(
"Deprecated: Use pythainlp.classify instead.", DeprecationWarning
)
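
The shim keeps the old import path alive while pointing users at the new one. A quick way to observe the warning (assuming pythainlp.cls has not yet been imported in the session, since the module-level warning fires only on first import):

    import warnings

    with warnings.catch_warnings(record=True) as caught:
        # DeprecationWarning is hidden by default; surface it.
        warnings.simplefilter("always")
        from pythainlp.cls import GzipModel  # old path, still functional

    print([str(w.message) for w in caught])
    # expected to include: "Deprecated: Use pythainlp.classify instead."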
21 changes: 14 additions & 7 deletions pythainlp/coref/_fastcoref.py
@@ -6,7 +6,13 @@


class FastCoref:
def __init__(self, model_name, nlp=spacy.blank("th"), device: str="cpu", type: str="FCoref") -> None:
def __init__(
self,
model_name,
nlp=spacy.blank("th"),
device: str = "cpu",
type: str = "FCoref",
) -> None:
if type == "FCoref":
from fastcoref import FCoref as _model
else:
@@ -17,11 +23,12 @@ def __init__(self, model_name, nlp=spacy.blank("th"), device: str="cpu", type: s

def _to_json(self, _predict):
return {
"text":_predict.text,
"clusters_string":_predict.get_clusters(as_strings=True),
"clusters":_predict.get_clusters(as_strings=False)
"text": _predict.text,
"clusters_string": _predict.get_clusters(as_strings=True),
"clusters": _predict.get_clusters(as_strings=False),
}


def predict(self, texts: List[str]) -> dict:
return [self._to_json(i) for i in self.model.predict(texts=texts)]
def predict(self, texts: List[str]) -> List[dict]:
return [
self._to_json(pred) for pred in self.model.predict(texts=texts)
]
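
With the annotation corrected, predict() is declared to return what it already produced: one dict per input text. A sketch of that shape, mirroring the keys assembled in _to_json() above (the values here are illustrative, not real model output):

    from typing import List

    # Illustrative output for one input text; each span is a
    # (start, end) character offset into "text".
    preds: List[dict] = [
        {
            "text": "Bill Gates ... ผมรู้สึกสบายมาก",
            "clusters_string": [["Bill Gates", "ผม"]],
            "clusters": [[(0, 10), (50, 52)]],
        }
    ]
    for pred in preds:
        for spans, mentions in zip(
            pred["clusters"], pred["clusters_string"]
        ):
            print(mentions, spans)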
27 changes: 20 additions & 7 deletions pythainlp/coref/core.py
@@ -2,21 +2,26 @@
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
from typing import List

model = None


def coreference_resolution(texts:List[str], model_name:str="han-coref-v1.0", device:str="cpu"):
def coreference_resolution(
texts: List[str], model_name: str = "han-coref-v1.0", device: str = "cpu"
):
"""
Coreference Resolution

:param List[str] texts: list of texts to apply coreference resolution to
:param str model_name: coreference resolution model
:param str device: device for running coreference resolution model on (cpu, cuda, and others)
:param str device: device for running coreference resolution model on\
("cpu", "cuda", and others)
:return: List of texts with coreference resolution
:rtype: List[dict]

:Options for model_name:
* *han-coref-v1.0* - (default) Han-Corf: Thai oreference resolution by PyThaiNLP v1.0
* *han-coref-v1.0* - (default) Han-Coref: Thai coreference resolution\
by PyThaiNLP v1.0

:Example:
::
@@ -30,15 +35,23 @@ def coreference_resolution(texts:List[str], model_name:str="han-coref-v1.0", dev
)
# output:
# [
# {'text': 'Bill Gates ได้รับวัคซีน COVID-19 เข็มแรกแล้ว ระบุ ผมรู้สึกสบายมาก',
# 'clusters_string': [['Bill Gates', 'ผม']],
# {'text': 'Bill Gates ได้รับวัคซีน COVID-19 เข็มแรกแล้ว ระบุ ผมรู้สึกสบายมาก',
# 'clusters_string': [['Bill Gates', 'ผม']],
# 'clusters': [[(0, 10), (50, 52)]]}
# ]
"""
global model
if isinstance(texts, str):
texts = [texts]
if model is None and model_name=="han-coref-v1.0":

if model is None and model_name == "han-coref-v1.0":
from pythainlp.coref.han_coref import HanCoref

model = HanCoref(device=device)
return model.predict(texts)

if model:
return model.predict(texts)

return [
{"text": text, "clusters_string": [], "clusters": []} for text in texts
]
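
The added fallback guarantees a well-formed result even when no model is loaded: each input text comes back with empty clusters instead of an exception. A usage sketch, assuming a fresh process where the module-level model is still None:

    from pythainlp.coref import coreference_resolution

    # An unrecognized model_name leaves `model` as None, so the new
    # fallback returns empty clusters for each input text.
    out = coreference_resolution(
        ["Bill Gates ได้รับวัคซีนแล้ว"], model_name="no-such-model"
    )
    print(out)
    # [{'text': 'Bill Gates ได้รับวัคซีนแล้ว',
    #   'clusters_string': [], 'clusters': []}]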
6 changes: 2 additions & 4 deletions pythainlp/coref/han_coref.py
@@ -6,9 +6,7 @@


class HanCoref(FastCoref):
def __init__(self,device:str="cpu",nlp=spacy.blank("th")) -> None:
def __init__(self, device: str = "cpu", nlp=spacy.blank("th")) -> None:
super().__init__(
model_name="pythainlp/han-coref-v1.0",
device=device,
nlp=nlp
model_name="pythainlp/han-coref-v1.0", device=device, nlp=nlp
)
12 changes: 6 additions & 6 deletions pythainlp/tag/__init__.py
@@ -10,17 +10,17 @@

__all__ = [
"PerceptronTagger",
"NER",
"NNER",
"chunk_parse",
"pos_tag",
"pos_tag_sents",
"pos_tag_transformers",
"tag_provinces",
"chunk_parse",
"NER",
"NNER",
"pos_tag_transformers"
]

from pythainlp.tag.locations import tag_provinces
from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents, pos_tag_transformers
from pythainlp.tag._tag_perceptron import PerceptronTagger
from pythainlp.tag.chunk import chunk_parse
from pythainlp.tag.locations import tag_provinces
from pythainlp.tag.named_entity import NER, NNER
from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents, pos_tag_transformers
20 changes: 20 additions & 0 deletions tests/test_classify.py
@@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-
import unittest
from pythainlp.classify import GzipModel


class TestClsPackage(unittest.TestCase):
def test_GzipModel(self):
training_data = [
("รายละเอียดตามนี้เลยค่าา ^^", "Neutral"),
("กลัวพวกมึงหาย อดกินบาบิก้อน", "Neutral"),
("บริการแย่มากก เป็นหมอได้ไง😤", "Negative"),
("ขับรถแย่มาก", "Negative"),
("ดีนะครับ", "Positive"),
("ลองแล้วรสนี้อร่อย... ชอบๆ", "Positive"),
("ฉันรู้สึกโกรธ เวลามือถือแบตหมด", "Negative"),
("เธอภูมิใจที่ได้ทำสิ่งดี ๆ และดีใจกับเด็ก ๆ", "Positive"),
("นี่เป็นบทความหนึ่ง", "Neutral"),
]
model = GzipModel(training_data)
self.assertEqual(model.predict("ฉันดีใจ", k=1), "Positive")
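
To exercise just this new module, a plain unittest run from the repository root is enough (assuming the tests directory is discoverable as laid out here):

    import unittest

    # Load and run only tests/test_classify.py.
    suite = unittest.defaultTestLoader.discover(
        "tests", pattern="test_classify.py"
    )
    unittest.TextTestRunner(verbosity=2).run(suite)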
20 changes: 0 additions & 20 deletions tests/test_cls.py

This file was deleted.
