
Optimize tagger logic using numpy #35

Open
wants to merge 8 commits into main
Changes from 5 commits
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/).
### Changed
- Updated versions of the dependencies: `pylangacq >= 0.17.0` and `wordseg >= 0.0.4`.
- Restructured the repository to use top-level `src/` and `tests/` directories.
- Rewrote some of the tagger's math logic with numpy, for a ~3x performance gain.

### Deprecated
### Removed
1 change: 1 addition & 0 deletions pyproject.toml
@@ -25,6 +25,7 @@ keywords = [
dependencies = [
'importlib-metadata >= 1.0; python_version < "3.8"',
'pylangacq >= 0.17.0',
'numpy >= 1.23.0',
'wordseg >= 0.0.4',
]
classifiers = [
4 changes: 2 additions & 2 deletions src/pycantonese/pos_tagging/tagger.pickle
Git LFS file not shown
192 changes: 125 additions & 67 deletions src/pycantonese/pos_tagging/tagger.py
@@ -5,7 +5,9 @@
import pickle # nosec
import random

from typing import Dict
from typing import Dict, Iterable, List, Hashable

import numpy

from pycantonese._punctuation_marks import _PUNCTUATION_MARKS
from pycantonese.pos_tagging.hkcancor_to_ud import hkcancor_to_ud
@@ -21,6 +23,21 @@
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
_PICKLE_PATH = os.path.join(_THIS_DIR, "tagger.pickle")

# Feature prefixes.
_F_BIAS = "bias"
_F_CUR_WORD_FIRST_CHAR = "i word's first char"
_F_CUR_WORD_FINAL_CHAR = "i word's final char"
_F_PREV_WORD_FIRST_CHAR = "i-1 word's first char"
_F_PREV_WORD_FINAL_CHAR = "i-1 word's final char"
_F_PREV_TAG = "i-1 tag"
_F_PREV2_WORD_FIRST_CHAR = "i-2 word's first char"
_F_PREV2_WORD_FINAL_CHAR = "i-2 word's final char"
_F_PREV2_TAG = "i-2 tag"
_F_NEXT_WORD_FIRST_CHAR = "i+1 word's first char"
_F_NEXT_WORD_FINAL_CHAR = "i+1 word's final char"
_F_NEXT2_WORD_FIRST_CHAR = "i+2 word's first char"
_F_NEXT2_WORD_FINAL_CHAR = "i+2 word's final char"


class _AveragedPerceptron:
"""An averaged perceptron.
@@ -31,64 +48,77 @@ class _AveragedPerceptron:
"""

def __init__(self):
# Each feature (key) gets its own weight vector (value).
self.weights: Dict[str, Dict[str, float]] = {}
self.classes = set()
# The accumulated values, for the averaging. These will be keyed by
# feature/class tuples
self._totals = collections.defaultdict(int)
# The last time the feature was changed, for the averaging. Also
# keyed by feature/class tuples
self.classes: List[str] = []
self.features: List[Hashable] = []
# Maps class/label into row index of the weights matrix.
self._class_to_index: Dict[str, int] = {}
# Maps feature into column index of the weights matrix.
self._feature_to_index: Dict[Hashable, int] = {}
# Matrix represented by 2D array of shape (n_classes, n_features).
self._weights = numpy.zeros(1)
Owner commented:
It looks like this _weights array is a pretty sparse matrix at the end of the day (2 million values, with ~93% of them zero). What do you think about switching to one of the sparse matrix representations from scipy (I'd be okay with adding scipy as a dependency) to lower the memory footprint? Speaking of which, maybe we can use float32 instead of numpy's default float64 to save more memory? I doubt that the higher precision of float64 matters for our purposes here.

@ZhanruiLiang (Author) replied on Nov 17, 2022:
Changed to float32. I tried scipy, but its sparse matrices are not a good fit here: the formats that support fast arithmetic are expensive to update, and we need to do such updates constantly. As a result, training becomes much slower; a training iteration takes about 10s, while the current implementation takes under 1s. After switching to float32, the memory footprint is under 8 MiB, which I don't think is a real concern. Importing scipy alone would cost more than 8 MiB.
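
For context, a minimal sketch of the dtype trade-off (the matrix shape is an assumption chosen to give ~2 million weights; the real tagger's shape differs):

import numpy

n_classes, n_features = 50, 40_000  # assumed: ~2 million weights in total
w64 = numpy.zeros((n_classes, n_features), dtype=numpy.float64)
w32 = numpy.zeros((n_classes, n_features), dtype=numpy.float32)
print(f"float64: {w64.nbytes / 2**20:.1f} MiB")  # 15.3 MiB
print(f"float32: {w32.nbytes / 2**20:.1f} MiB")  # 7.6 MiB, i.e. under 8 MiB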


# The following attributes are only used for training.

# The accumulated values, for the averaging. Has the same shape as _weights.
self._totals = numpy.zeros(1)
# The last iteration at which each weight was changed, for the averaging.
# Has the same shape as _weights.
# ("tstamps" is short for "timestamps".)
self._tstamps = collections.defaultdict(int)
self._tstamps = numpy.zeros(1)
# Number of instances seen
self.i = 0

def predict(self, features):
def rescope(self, features: Iterable[Hashable], classes: Iterable[str]):
""" "Change the features and classes.

Assume they won't change until next call.
"""
self.features = list(features)
self.classes = sorted(classes)
self._weights = numpy.zeros(
(len(self.classes), len(self.features)), dtype=numpy.float64
)
self._totals = self._weights.copy()
self._tstamps = numpy.zeros(self._weights.shape, dtype=numpy.int32)
self._feature_to_index = {f: i for i, f in enumerate(self.features)}
self._class_to_index = {c: i for i, c in enumerate(self.classes)}
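
A minimal usage sketch of rescope (toy feature and class names, assumed): the matrix layout is fixed once, up front, so row and column indices stay valid across all later predict and update calls.

p = _AveragedPerceptron()
p.rescope(features=["bias", "i-1 tag N"], classes={"N", "V"})
assert p._weights.shape == (2, 2)  # (n_classes, n_features)
assert p.classes == ["N", "V"]     # sorted, for a stable row order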

def predict(self, features: Dict[Hashable, float]):
"""Return the best label for the given features.

It's computed based on the dot-product between the features and
current weights.
"""
scores = collections.defaultdict(float)
for feat, value in features.items():
if feat not in self.weights or value == 0:
continue
weights = self.weights[feat]
for label, weight in weights.items():
scores[label] += value * weight
# Do a secondary alphabetic sort, for stability
return max(self.classes, key=lambda label: (scores[label], label))

def update(self, truth, guess, features):
fs, vs = zip(*(i for i in features.items() if i[0] in self._feature_to_index))
# The feature values vector.
fvec = numpy.array(vs)
weights = self._weights[:, [self._feature_to_index[f] for f in fs]]
return self.classes[weights.dot(fvec).argmax()]
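
The column selection plus dot product reproduces what the old dict-of-dicts loop computed, one multiply-add per (feature, class) pair. A toy sketch of the same arithmetic, with assumed weights:

import numpy

classes = ["N", "V"]
feature_to_index = {"bias": 0, "i word's final char 飯": 1}
weights = numpy.array([[0.5, -1.0],   # one row per class,
                       [0.2,  2.0]])  # one column per feature
features = {"bias": 1.0, "i word's final char 飯": 1.0}

fs, vs = zip(*((f, v) for f, v in features.items() if f in feature_to_index))
scores = weights[:, [feature_to_index[f] for f in fs]].dot(numpy.array(vs))
assert classes[scores.argmax()] == "V"  # 0.2 + 2.0 beats 0.5 - 1.0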

def update(self, truth: str, guess: str, features: Iterable[Hashable]):
"""Update the feature weights."""

def upd_feat(c, f, w, v):
param = (f, c)
self._totals[param] += (self.i - self._tstamps[param]) * w
self._tstamps[param] = self.i
self.weights[f][c] = w + v
def upd_feat(ci: int, fi: int, v: float):
w = self._weights[ci, fi]
self._totals[ci, fi] += (self.i - self._tstamps[ci, fi]) * w
self._tstamps[ci, fi] = self.i
self._weights[ci, fi] += v

self.i += 1
if truth == guess:
return None
truth_i = self._class_to_index[truth]
guess_i = self._class_to_index[guess]
for f in features:
weights = self.weights.setdefault(f, {})
upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
fi = self._feature_to_index[f]
upd_feat(truth_i, fi, 1.0)
upd_feat(guess_i, fi, -1.0)

def average_weights(self):
"""Average weights from all iterations."""
for feat, weights in self.weights.items():
new_feat_weights = {}
for clas, weight in weights.items():
param = (feat, clas)
total = self._totals[param]
total += (self.i - self._tstamps[param]) * weight
averaged = round(total / float(self.i), 3)
if averaged:
new_feat_weights[clas] = averaged
self.weights[feat] = new_feat_weights
for fi, weights in enumerate(self._weights.T):
total = self._totals[:, fi] + (self.i - self._tstamps[:, fi]) * weights
self._weights[:, fi] = numpy.round(total / self.i, 3)
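
The totals/timestamps pair implements lazy averaging: a weight's contribution to its running total is settled only when that weight changes (in upd_feat) or here at the end, because between changes it contributes (elapsed instances) times (current value). A one-cell sketch with made-up numbers:

import numpy

i = 10                       # instances seen in total
weight = numpy.array([2.0])  # current value, unchanged since instance 7
total = numpy.array([6.0])   # mass accumulated up to the last change
tstamp = numpy.array([7])    # instance index of the last change

total += (i - tstamp) * weight    # 6.0 + 3 * 2.0 = 12.0
print(numpy.round(total / i, 3))  # [1.2], the averaged weight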


class POSTagger:
@@ -129,6 +159,7 @@ def __init__(self, *, frequency_threshold=10, ambiguity_threshold=0.95, n_iter=5
self.model = _AveragedPerceptron()
self.tagdict = {}
self.classes = set()
self.features = set()

# HKCanCor doesn't have the Chinese full-width punctuation marks.
self.tagdict.update({punct: punct for punct in _PUNCTUATION_MARKS})
@@ -147,10 +178,10 @@ def tag(self, words):
list[str]
The list of predicted tags.
"""
prev, prev2 = self.START
tags = []
if not words:
return tags
prev2, prev = self.START
context = self.START + words + self.END
for i, word in enumerate(words):
tag = self.tagdict.get(word)
@@ -173,32 +204,32 @@ def train(self, tagged_sents, save=None):
If given, save the trained model as a pickle at this path.
"""
self._make_tagdict(tagged_sents)
self.model.classes = self.classes
model = self.model
model.rescope(self.features, self.classes)

prev, prev2 = self.START
for iter_ in range(self.n_iter):
c = 0
n = 0
for tagged_sent in tagged_sents:
prev2, prev = self.START
context = self.START + [w for w, _ in tagged_sent] + self.END
for i, (word, tag) in enumerate(tagged_sent):
try:
guess = self.tagdict[word]
except KeyError:
feats = self._get_features(i, word, context, prev, prev2)
guess = self.model.predict(feats)
self.model.update(tag, guess, feats)
prev2 = prev
prev = guess
guess = model.predict(feats)
model.update(tag, guess, feats)
prev2, prev = prev, guess
c += guess == tag
n += 1
random.shuffle(tagged_sents)
logging.info("Iter %d: %d / %d = %f", iter_, c, n, c / n)
self.model.average_weights()
model.average_weights()

if save is not None:
pickle.dump(
(self.model.weights, self.tagdict, self.classes),
(self.tagdict, model._weights, model.classes, model.features),
open(save, "wb"),
protocol=_PICKLE_PROTOCOL,
)
@@ -212,7 +243,7 @@ def load(self, path):
The path where the pickled model is located.
"""
try:
w_td_c = pickle.load(open(path, "rb")) # nosec
data = pickle.load(open(path, "rb")) # nosec
except IOError:
raise FileNotFoundError(f"Can't locate tagger model {path}")
except: # noqa
@@ -226,10 +257,15 @@
"In the latter case, please install Git LFS "
"(https://git-lfs.github.com/) and re-install pycantonese."
)
self.model.weights, self.tagdict, self.classes = w_td_c
self.model.classes = self.classes

def _get_features(self, i, word, context, prev, prev2):
self.tagdict, weights, classes, features = data
self.classes = set(classes)
self.features = set(features)
self.model.rescope(features, classes)
self.model._weights = weights
self.model._totals = None
self.model._tstamps = None
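
save and load are mirror images: train pickles a 4-tuple, and load unpacks the same one. A self-contained round-trip sketch with toy stand-in values (assumed; the real payload carries the trained weights matrix):

import pickle

# Toy stand-ins for (tagdict, weights, classes, features).
payload = ({"食": "V"}, [[0.0]], ["V"], ["bias"])
blob = pickle.dumps(payload, protocol=4)
tagdict, weights, classes, features = pickle.loads(blob)
assert classes == ["V"] and features == ["bias"]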

def _get_features(self, i, word, context: List[str], prev, prev2):
"""Map tokens into a feature representation, implemented as a
{hashable: float} dict. If the features change, a new model must be
trained.
@@ -243,24 +279,25 @@ def add(name, *args):

# It's useful to have a constant feature,
# which acts sort of like a prior.
add("bias")
add(_F_BIAS)

add("i word's first char", word[0])
add("i word's final char", word[-1])
add(_F_CUR_WORD_FIRST_CHAR, word[0])
add(_F_CUR_WORD_FINAL_CHAR, word[-1])

add("i-1 word's first char", context[i - 1][0])
add("i-1 word's final char", context[i - 1][-1])
add("i-1 tag", prev)
add(_F_PREV_WORD_FIRST_CHAR, context[i - 1][0])
add(_F_PREV_WORD_FINAL_CHAR, context[i - 1][-1])
add(_F_PREV_TAG, prev)

add("i-2 word's first char", context[i - 2][0])
add("i-2 word's final char", context[i - 2][-1])
add("i-2 tag", prev2)
add(_F_PREV2_WORD_FIRST_CHAR, context[i - 2][0])
add(_F_PREV2_WORD_FINAL_CHAR, context[i - 2][-1])
add(_F_PREV2_TAG, prev2)

add("i+1 word's first char", context[i + 1][0])
add("i+1 word's final char", context[i + 1][-1])
add(_F_NEXT_WORD_FIRST_CHAR, context[i + 1][0])
add(_F_NEXT_WORD_FINAL_CHAR, context[i + 1][-1])

add("i+2 word's first char", context[i + 2][0])
add("i+2 word's final char", context[i + 2][-1])
# The previous implementation had a copy-paste error here.
add(_F_NEXT2_WORD_FIRST_CHAR, context[i + 2][0])
add(_F_NEXT2_WORD_FINAL_CHAR, context[i + 2][-1])

return features
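
The body of add is folded out of this diff; presumably, as in the classic averaged-perceptron tagger this design follows, it space-joins the template name with its arguments into a single hashable key, the same format the f"{prefix} {value}" strings in _make_tagdict below must match. A sketch under that assumption:

import collections

def get_features_sketch(word, prev_tag):
    features = collections.defaultdict(float)

    def add(name, *args):
        # Assumed behavior: space-join the template name and its arguments.
        features[" ".join((name,) + args)] += 1.0

    add("bias")
    add("i word's first char", word[0])
    add("i-1 tag", prev_tag)
    return dict(features)

print(get_features_sketch("食飯", "V"))
# {'bias': 1.0, "i word's first char 食": 1.0, 'i-1 tag V': 1.0}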

@@ -280,8 +317,29 @@ def _make_tagdict(self, tagged_sents):
unambiguous = (mode / n) >= self.ambiguity_threshold
if above_freq_threshold and unambiguous:
self.tagdict[word] = tag

self.features.add(_F_BIAS)
for word in words | set(self.START) | set(self.END):
self.features.add(f"{_F_CUR_WORD_FIRST_CHAR} {word[0]}")
self.features.add(f"{_F_CUR_WORD_FINAL_CHAR} {word[-1]}")
self.features.add(f"{_F_PREV_WORD_FIRST_CHAR} {word[0]}")
self.features.add(f"{_F_PREV_WORD_FINAL_CHAR} {word[-1]}")
self.features.add(f"{_F_PREV2_WORD_FIRST_CHAR} {word[0]}")
self.features.add(f"{_F_PREV2_WORD_FINAL_CHAR} {word[-1]}")
self.features.add(f"{_F_NEXT_WORD_FIRST_CHAR} {word[0]}")
self.features.add(f"{_F_NEXT_WORD_FINAL_CHAR} {word[-1]}")
self.features.add(f"{_F_NEXT2_WORD_FIRST_CHAR} {word[0]}")
self.features.add(f"{_F_NEXT2_WORD_FINAL_CHAR} {word[-1]}")
for tag in self.classes:
for prefix in (_F_PREV2_TAG, _F_PREV_TAG):
self.features.add(f"{prefix} {tag}")
self.features.add(f"{_F_PREV2_TAG} {self.START[0]}")
self.features.add(f"{_F_PREV_TAG} {self.START[1]}")
self.features.add(f"{_F_PREV2_TAG} {self.START[1]}")

logging.info("%d unique words in the training data", len(words))
logging.info("%d tags in this tagset", len(self.classes))
logging.info("%d features populated for the training data", len(self.features))
logging.info("%d words are treated as having a unique tag", len(self.tagdict))


4 changes: 3 additions & 1 deletion src/pycantonese/pos_tagging/train_tagger.py
@@ -1,6 +1,7 @@
"""This script trains a part-of-speech tagger."""

import logging
import random

from pycantonese import hkcancor
from pycantonese.pos_tagging import POSTagger
@@ -10,7 +11,7 @@
_TAGGER_PARAMETERS = {
"frequency_threshold": 10,
"ambiguity_threshold": 0.9,
"n_iter": 10,
"n_iter": 50,
}

# Several POS tags in HKCanCor are odd ones for proper nouns.
@@ -42,4 +43,5 @@ def _get_tagged_sents():
if __name__ == "__main__":
logging.basicConfig(level="INFO")
tagger = POSTagger(**_TAGGER_PARAMETERS)
random.seed(123456)
tagger.train(_get_tagged_sents(), save=_PICKLE_PATH)
4 changes: 2 additions & 2 deletions tests/test_parsing.py
@@ -27,11 +27,11 @@
),
(
# Unseen "word", so no jyutping in the output
"135",
"136",
None,
None,
None,
"*X: 135\n%mor: X|\n",
"*X: 136\n%mor: X|\n",
),
(
# Custom POS tagging