Skip to content

Commit

Permalink
Merge branch 'hotfix/0.2.6'
Browse files Browse the repository at this point in the history
  • Loading branch information
emfomy committed Jun 9, 2021
2 parents 3e1d007 + f60f9d2 commit 910b3df
Show file tree
Hide file tree
Showing 12 changed files with 217 additions and 209 deletions.
8 changes: 6 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,25 @@ RM = rm -rf
TWINE = twine
TOX = tox
LINT = pylint --rcfile=./.pylintrc
FORMAT = $(PY) -m black --color --diff

.PHONY: all check dist sdist test tox tox-v tox-vv tox-report lint doc upload clean
.PHONY: all check dist sdist test tox tox-v tox-vv tox-report lint format doc upload clean

all: dist check test

dist: sdist bdist_wheel

test: tox lint
test: tox format lint

sdist bdist_wheel:
$(PY) setup.py $@

lint:
$(LINT) ckip_transformers

format:
$(FORMAT) ckip_transformers

check:
$(TWINE) check dist/*

Expand Down
18 changes: 9 additions & 9 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ NLP Tools Usage
.. code-block:: python
# Initialize drivers
ws_driver = CkipWordSegmenter(level=3)
ws_driver = CkipWordSegmenter(level=3)
pos_driver = CkipPosTagger(level=3)
ner_driver = CkipNerChunker(level=3)
Expand All @@ -314,9 +314,9 @@ NLP Tools Usage
.. code-block:: python
# Initialize drivers with custom checkpoints
ws_driver = CkipWordSegmenter(model_name='path_to_your_model')
pos_driver = CkipPosTagger(model_name='path_to_your_model')
ner_driver = CkipNerChunker(model_name='path_to_your_model')
ws_driver = CkipWordSegmenter(model_name="path_to_your_model")
pos_driver = CkipPosTagger(model_name="path_to_your_model")
ner_driver = CkipNerChunker(model_name="path_to_your_model")
| To use a GPU, specify the device ID when initializing the drivers. Set it to -1 (default) to disable the GPU.
| 可於宣告斷詞等工具時指定 device 以使用 GPU,設為 -1 (預設值)代表不使用 GPU。
Expand All @@ -341,9 +341,9 @@ NLP Tools Usage
# Input text
text = [
'傅達仁今將執行安樂死,卻突然爆出自己20年前遭緯來體育台封殺,他不懂自己哪裡得罪到電視台。',
'美國參議院針對今天總統布什所提名的勞工部長趙小蘭展開認可聽證會,預料她將會很順利通過參議院支持,成為該國有史以來第一位的華裔女性內閣成員。',
'空白 也是可以的~',
"傅達仁今將執行安樂死,卻突然爆出自己20年前遭緯來體育台封殺,他不懂自己哪裡得罪到電視台。",
"美國參議院針對今天總統布什所提名的勞工部長趙小蘭展開認可聽證會,預料她將會很順利通過參議院支持,成為該國有史以來第一位的華裔女性內閣成員。",
"空白 也是可以的~",
]
# Run pipeline
Expand Down Expand Up @@ -386,8 +386,8 @@ NLP Tools Usage
assert len(sentence_ws) == len(sentence_pos)
res = []
for word_ws, word_pos in zip(sentence_ws, sentence_pos):
res.append(f'{word_ws}({word_pos})')
return '\u3000'.join(res)
res.append(f"{word_ws}({word_pos})")
return "\u3000".join(res)
# Show results
for sentence, sentence_ws, sentence_pos, sentence_ner in zip(text, ws, pos, ner):
Expand Down
18 changes: 9 additions & 9 deletions ckip_transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@
The CKIP Transformers.
"""

__author_name__ = 'Mu Yang'
__author_email__ = 'emfomy@gmail.com'
__copyright__ = '2020 CKIP Lab'
__author_name__ = "Mu Yang"
__author_email__ = "emfomy@gmail.com"
__copyright__ = "2020 CKIP Lab"

__title__ = 'CKIP Transformers'
__version__ = '0.2.5'
__description__ = 'CKIP Transformers'
__license__ = 'GPL-3.0'
__title__ = "CKIP Transformers"
__version__ = "0.2.6"
__description__ = "CKIP Transformers"
__license__ = "GPL-3.0"

__url__ = 'https://ckip-transformers.readthedocs.io'
__download_url__ = __url__+'/tarball/'+__version__
__url__ = "https://ckip-transformers.readthedocs.io"
__download_url__ = __url__ + "/tarball/" + __version__
7 changes: 4 additions & 3 deletions ckip_transformers/nlp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-

"""
This module provides the CKIP Transformers NLP drivers.
"""

__author__ = 'Mu Yang <http://muyang.pro>'
__copyright__ = '2020 CKIP Lab'
__license__ = 'GPL-3.0'
__author__ = "Mu Yang <http://muyang.pro>"
__copyright__ = "2020 CKIP Lab"
__license__ = "GPL-3.0"

from .driver import (
CkipWordSegmenter,
Expand Down
121 changes: 61 additions & 60 deletions ckip_transformers/nlp/driver.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-

"""
This module implements the CKIP Transformers NLP drivers.
"""

__author__ = 'Mu Yang <http://muyang.pro>'
__copyright__ = '2020 CKIP Lab'
__license__ = 'GPL-3.0'
__author__ = "Mu Yang <http://muyang.pro>"
__copyright__ = "2020 CKIP Lab"
__license__ = "GPL-3.0"

from typing import (
List, )
List,
)

import numpy as np

Expand All @@ -24,30 +26,29 @@
class CkipWordSegmenter(CkipTokenClassification):
"""The word segmentation driver.
Parameters
----------
level : ``str`` *optional*, defaults to 3, must be 1—3
The model level. The higher the level is, the more accurate and slower the model is.
model_name : ``str`` *optional*, overwrites **level**
The pretrained model name (e.g. ``'ckiplab/bert-base-chinese-ws'``).
device : ``int``, *optional*, defaults to -1,
Device ordinal for CPU/GPU supports.
Setting this to -1 will leverage CPU, a positive will run the model on the associated CUDA device id.
Parameters
----------
level : ``int``, *optional*, defaults to 3, must be 1—3
The model level. The higher the level is, the more accurate and slower the model is.
model_name : ``str``, *optional*, overwrites **level**
The pretrained model name (e.g. ``'ckiplab/bert-base-chinese-ws'``).
device : ``int``, *optional*, defaults to -1
Device ordinal for CPU/GPU support.
Setting this to -1 will use the CPU; a non-negative value will run the model on the associated CUDA device id.
"""

_model_names = {
1: 'ckiplab/albert-tiny-chinese-ws',
2: 'ckiplab/albert-base-chinese-ws',
3: 'ckiplab/bert-base-chinese-ws',
1: "ckiplab/albert-tiny-chinese-ws",
2: "ckiplab/albert-base-chinese-ws",
3: "ckiplab/bert-base-chinese-ws",
}

def __init__(
self,
level: int = 3,
**kwargs,
):
model_name = kwargs.pop('model_name',
self._get_model_name_from_level(level))
model_name = kwargs.pop("model_name", self._get_model_name_from_level(level))
super().__init__(model_name=model_name, **kwargs)

def __call__(
Expand Down Expand Up @@ -91,13 +92,13 @@ def __call__(
output_text = []
for sent_data in zip(input_text, index_map):
output_sent = []
word = ''
word = ""
for input_char, logits_index in zip(*sent_data):
if logits_index is None:
if word:
output_sent.append(word)
output_sent.append(input_char)
word = ''
word = ""
else:
logits_b, logits_i = logits[logits_index]

Expand All @@ -121,30 +122,29 @@ def __call__(
class CkipPosTagger(CkipTokenClassification):
"""The part-of-speech tagging driver.
Parameters
----------
level : ``str`` *optional*, defaults to 3, must be 1—3
The model level. The higher the level is, the more accurate and slower the model is.
model_name : ``str`` *optional*, overwrites **level**
The pretrained model name (e.g. ``'ckiplab/bert-base-chinese-pos'``).
device : ``int``, *optional*, defaults to -1,
Device ordinal for CPU/GPU supports.
Setting this to -1 will leverage CPU, a positive will run the model on the associated CUDA device id.
Parameters
----------
level : ``int``, *optional*, defaults to 3, must be 1—3
The model level. The higher the level is, the more accurate and slower the model is.
model_name : ``str``, *optional*, overwrites **level**
The pretrained model name (e.g. ``'ckiplab/bert-base-chinese-pos'``).
device : ``int``, *optional*, defaults to -1
Device ordinal for CPU/GPU support.
Setting this to -1 will use the CPU; a non-negative value will run the model on the associated CUDA device id.
"""

_model_names = {
1: 'ckiplab/albert-tiny-chinese-pos',
2: 'ckiplab/albert-base-chinese-pos',
3: 'ckiplab/bert-base-chinese-pos',
1: "ckiplab/albert-tiny-chinese-pos",
2: "ckiplab/albert-base-chinese-pos",
3: "ckiplab/bert-base-chinese-pos",
}

def __init__(
self,
level: int = 3,
**kwargs,
):
model_name = kwargs.pop('model_name',
self._get_model_name_from_level(level))
model_name = kwargs.pop("model_name", self._get_model_name_from_level(level))
super().__init__(model_name=model_name, **kwargs)

def __call__(
Expand Down Expand Up @@ -193,7 +193,7 @@ def __call__(
output_sent = []
for input_char, logits_index in zip(*sent_data):
if logits_index is None or input_char.isspace():
label = 'WHITESPACE'
label = "WHITESPACE"
else:
label = id2label[np.argmax(logits[logits_index])]
output_sent.append(label)
Expand All @@ -208,30 +208,29 @@ def __call__(
class CkipNerChunker(CkipTokenClassification):
"""The named-entity recognition driver.
Parameters
----------
level : ``str`` *optional*, defaults to 3, must be 1—3
The model level. The higher the level is, the more accurate and slower the model is.
model_name : ``str`` *optional*, overwrites **level**
The pretrained model name (e.g. ``'ckiplab/bert-base-chinese-ner'``).
device : ``int``, *optional*, defaults to -1,
Device ordinal for CPU/GPU supports.
Setting this to -1 will leverage CPU, a positive will run the model on the associated CUDA device id.
Parameters
----------
level : ``int``, *optional*, defaults to 3, must be 1—3
The model level. The higher the level is, the more accurate and slower the model is.
model_name : ``str``, *optional*, overwrites **level**
The pretrained model name (e.g. ``'ckiplab/bert-base-chinese-ner'``).
device : ``int``, *optional*, defaults to -1
Device ordinal for CPU/GPU support.
Setting this to -1 will use the CPU; a non-negative value will run the model on the associated CUDA device id.
"""

_model_names = {
1: 'ckiplab/albert-tiny-chinese-ner',
2: 'ckiplab/albert-base-chinese-ner',
3: 'ckiplab/bert-base-chinese-ner',
1: "ckiplab/albert-tiny-chinese-ner",
2: "ckiplab/albert-base-chinese-ner",
3: "ckiplab/bert-base-chinese-ner",
}

def __init__(
self,
level: int = 3,
**kwargs,
):
model_name = kwargs.pop('model_name',
self._get_model_name_from_level(level))
model_name = kwargs.pop("model_name", self._get_model_name_from_level(level))
super().__init__(model_name=model_name, **kwargs)

def __call__(
Expand Down Expand Up @@ -282,21 +281,21 @@ def __call__(
entity_ner = None
entity_idx0 = None
for index_char, (
input_char,
logits_index,
input_char,
logits_index,
) in enumerate(zip(*sent_data)):
if logits_index is None:
label = 'O'
label = "O"
else:
label = id2label[np.argmax(logits[logits_index])]

if label == 'O':
if label == "O":
entity_ner = None
continue

bioes, ner = label.split('-')
bioes, ner = label.split("-")

if bioes == 'S':
if bioes == "S":
output_sent.append(
NerToken(
word=input_char,
Expand All @@ -305,18 +304,19 @@ def __call__(
index_char,
index_char + len(input_char),
),
))
)
)
entity_ner = None
elif bioes == 'B':
elif bioes == "B":
entity_word = input_char
entity_ner = ner
entity_idx0 = index_char
elif bioes == 'I':
elif bioes == "I":
if entity_ner == ner:
entity_word += input_char
else:
entity_ner = None
elif bioes == 'E':
elif bioes == "E":
if entity_ner == ner:
entity_word += input_char
output_sent.append(
Expand All @@ -327,7 +327,8 @@ def __call__(
entity_idx0,
index_char + len(input_char),
),
))
)
)
entity_ner = None

output_text.append(output_sent)
Expand Down
Loading

0 comments on commit 910b3df

Please sign in to comment.