
Optimize tagger logic using numpy #35

Open
wants to merge 8 commits into main
Changes from 5 commits
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/).
### Changed
- Updated versions of the dependencies: `pylangacq >= 0.17.0` and `wordseg >= 0.0.4`.
- Restructured the repository to use top-level `src/` and `tests/` directories.
- Rewrote some of the tagger's math logic with numpy, for a ~3x performance gain.

### Deprecated
### Removed
1 change: 1 addition & 0 deletions pyproject.toml
@@ -25,6 +25,7 @@ keywords = [
dependencies = [
'importlib-metadata >= 1.0; python_version < "3.8"',
'pylangacq >= 0.17.0',
'numpy >= 1.23.0',
'wordseg >= 0.0.4',
]
classifiers = [
4 changes: 2 additions & 2 deletions src/pycantonese/pos_tagging/tagger.pickle
Git LFS file not shown
192 changes: 125 additions & 67 deletions src/pycantonese/pos_tagging/tagger.py
@@ -5,7 +5,9 @@
import pickle # nosec
import random

from typing import Dict
from typing import Dict, Iterable, List, Hashable

import numpy

from pycantonese._punctuation_marks import _PUNCTUATION_MARKS
from pycantonese.pos_tagging.hkcancor_to_ud import hkcancor_to_ud
@@ -21,6 +23,21 @@
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
_PICKLE_PATH = os.path.join(_THIS_DIR, "tagger.pickle")

# Feature prefixes.
_F_BIAS = "bias"
_F_CUR_WORD_FIRST_CHAR = "i word's first char"
_F_CUR_WORD_FINAL_CHAR = "i word's final char"
_F_PREV_WORD_FIRST_CHAR = "i-1 word's first char"
_F_PREV_WORD_FINAL_CHAR = "i-1 word's final char"
_F_PREV_TAG = "i-1 tag"
_F_PREV2_WORD_FIRST_CHAR = "i-2 word's first char"
_F_PREV2_WORD_FINAL_CHAR = "i-2 word's final char"
_F_PREV2_TAG = "i-2 tag"
_F_NEXT_WORD_FIRST_CHAR = "i+1 word's first char"
_F_NEXT_WORD_FINAL_CHAR = "i+1 word's final char"
_F_NEXT2_WORD_FIRST_CHAR = "i+2 word's first char"
_F_NEXT2_WORD_FINAL_CHAR = "i+2 word's final char"


class _AveragedPerceptron:
"""An averaged perceptron.
@@ -31,64 +48,77 @@ class _AveragedPerceptron:
"""

def __init__(self):
# Each feature (key) gets its own weight vector (value).
self.weights: Dict[str, Dict[str, float]] = {}
self.classes = set()
# The accumulated values, for the averaging. These will be keyed by
# feature/class tuples
self._totals = collections.defaultdict(int)
# The last time the feature was changed, for the averaging. Also
# keyed by feature/class tuples
self.classes: List[str] = []
self.features: List[Hashable] = []
# Maps class/label into row index of the weights matrix.
self._class_to_index: Dict[str, int] = {}
# Maps feature into column index of the weights matrix.
self._feature_to_index: Dict[Hashable, int] = {}
# Matrix represented by 2D array of shape (n_classes, n_features).
self._weights = numpy.zeros(1)
Owner commented:
It looks like this _weights array is a pretty sparse matrix at the end of the day (2 million values, with ~93% of them zero). What do you think about switching to one of the sparse matrix representations from scipy (I'd be okay with adding scipy as a dependency) to lower the memory footprint? Speaking of which, maybe we can use float32 instead of numpy's default float64 to save more memory? I doubt that the higher precision of float64 matters for our purposes here.

@ZhanruiLiang (Author) replied on Nov 17, 2022:
Changed to float32. I tried scipy, but its sparse matrices are not a good fit here: the formats that support fast arithmetic are expensive to update, and we need to do such updates constantly. As a result, training becomes much slower; a training iteration takes about 10s, while the current implementation takes under 1s. After switching to float32, the memory footprint is under 8 MiB, which I don't think is a real concern. Importing scipy alone would cost more than 8 MiB.
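
For context, a minimal sketch of the dtype trade-off (the matrix shape is an assumption chosen to give ~2 million weights; the real tagger's shape differs):

import numpy

n_classes, n_features = 50, 40_000  # assumed: ~2 million weights in total
w64 = numpy.zeros((n_classes, n_features), dtype=numpy.float64)
w32 = numpy.zeros((n_classes, n_features), dtype=numpy.float32)
print(f"float64: {w64.nbytes / 2**20:.1f} MiB")  # 15.3 MiB
print(f"float32: {w32.nbytes / 2**20:.1f} MiB")  # 7.6 MiB, i.e. under 8 MiB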


# The following attributes are only used for training.

# The accumulated values, for the averaging. Has the same shape as _weights.
self._totals = numpy.zeros(1)
# The last iteration at which each weight was changed, for the averaging.
# Has the same shape as _weights.
# ("tstamps" is short for "timestamps".)
self._tstamps = collections.defaultdict(int)
self._tstamps = numpy.zeros(1)
# Number of instances seen
self.i = 0

def predict(self, features):
def rescope(self, features: Iterable[Hashable], classes: Iterable[str]):
""" "Change the features and classes.

Assume they won't change until next call.
"""
self.features = list(features)
self.classes = sorted(classes)
self._weights = numpy.zeros(
(len(self.classes), len(self.features)), dtype=numpy.float64
)
self._totals = self._weights.copy()
self._tstamps = numpy.zeros(self._weights.shape, dtype=numpy.int32)
self._feature_to_index = {f: i for i, f in enumerate(self.features)}
self._class_to_index = {c: i for i, c in enumerate(self.classes)}
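
A minimal usage sketch of rescope (toy feature and class names, assumed): the matrix layout is fixed once, up front, so row and column indices stay valid across all later predict and update calls.

p = _AveragedPerceptron()
p.rescope(features=["bias", "i-1 tag N"], classes={"N", "V"})
assert p._weights.shape == (2, 2)  # (n_classes, n_features)
assert p.classes == ["N", "V"]     # sorted, for a stable row order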

def predict(self, features: Dict[Hashable, float]):
"""Return the best label for the given features.

It's computed based on the dot-product between the features and
current weights.
"""
scores = collections.defaultdict(float)
for feat, value in features.items():
if feat not in self.weights or value == 0:
continue
weights = self.weights[feat]
for label, weight in weights.items():
scores[label] += value * weight
# Do a secondary alphabetic sort, for stability
return max(self.classes, key=lambda label: (scores[label], label))

def update(self, truth, guess, features):
fs, vs = zip(*(i for i in features.items() if i[0] in self._feature_to_index))
# The feature values vector.
fvec = numpy.array(vs)
weights = self._weights[:, [self._feature_to_index[f] for f in fs]]
return self.classes[weights.dot(fvec).argmax()]
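
The column selection plus dot product reproduces what the old dict-of-dicts loop computed, one multiply-add per (feature, class) pair. A toy sketch of the same arithmetic, with assumed weights:

import numpy

classes = ["N", "V"]
feature_to_index = {"bias": 0, "i word's final char 飯": 1}
weights = numpy.array([[0.5, -1.0],   # one row per class,
                       [0.2,  2.0]])  # one column per feature
features = {"bias": 1.0, "i word's final char 飯": 1.0}

fs, vs = zip(*((f, v) for f, v in features.items() if f in feature_to_index))
scores = weights[:, [feature_to_index[f] for f in fs]].dot(numpy.array(vs))
assert classes[scores.argmax()] == "V"  # 0.2 + 2.0 beats 0.5 - 1.0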

def update(self, truth: str, guess: str, features: Iterable[Hashable]):
"""Update the feature weights."""

def upd_feat(c, f, w, v):
param = (f, c)
self._totals[param] += (self.i - self._tstamps[param]) * w
self._tstamps[param] = self.i
self.weights[f][c] = w + v
def upd_feat(ci: int, fi: int, v: float):
w = self._weights[ci, fi]
self._totals[ci, fi] += (self.i - self._tstamps[ci, fi]) * w
self._tstamps[ci, fi] = self.i
self._weights[ci, fi] += v

self.i += 1
if truth == guess:
return None
truth_i = self._class_to_index[truth]
guess_i = self._class_to_index[guess]
for f in features:
weights = self.weights.setdefault(f, {})
upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
fi = self._feature_to_index[f]
upd_feat(truth_i, fi, 1.0)
upd_feat(guess_i, fi, -1.0)

def average_weights(self):
"""Average weights from all iterations."""
for feat, weights in self.weights.items():
new_feat_weights = {}
for clas, weight in weights.items():
param = (feat, clas)
total = self._totals[param]
total += (self.i - self._tstamps[param]) * weight
averaged = round(total / float(self.i), 3)
if averaged:
new_feat_weights[clas] = averaged
self.weights[feat] = new_feat_weights
for fi, weights in enumerate(self._weights.T):
total = self._totals[:, fi] + (self.i - self._tstamps[:, fi]) * weights
self._weights[:, fi] = numpy.round(total / self.i, 3)
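
The totals/timestamps pair implements lazy averaging: a weight's contribution to its running total is settled only when that weight changes (in upd_feat) or here at the end, because between changes it contributes (elapsed instances) times (current value). A one-cell sketch with made-up numbers:

import numpy

i = 10                       # instances seen in total
weight = numpy.array([2.0])  # current value, unchanged since instance 7
total = numpy.array([6.0])   # mass accumulated up to the last change
tstamp = numpy.array([7])    # instance index of the last change

total += (i - tstamp) * weight    # 6.0 + 3 * 2.0 = 12.0
print(numpy.round(total / i, 3))  # [1.2], the averaged weight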


class POSTagger:
@@ -129,6 +159,7 @@ def __init__(self, *, frequency_threshold=10, ambiguity_threshold=0.95, n_iter=5
self.model = _AveragedPerceptron()
self.tagdict = {}
self.classes = set()
self.features = set()

# HKCanCor doesn't have the Chinese full-width punctuation marks.
self.tagdict.update({punct: punct for punct in _PUNCTUATION_MARKS})
@@ -147,10 +178,10 @@ def tag(self, words):
list[str]
The list of predicted tags.
"""
prev, prev2 = self.START
tags = []
if not words:
return tags
prev2, prev = self.START
context = self.START + words + self.END
for i, word in enumerate(words):
tag = self.tagdict.get(word)
@@ -173,32 +204,32 @@ def train(self, tagged_sents, save=None):
If given, save the trained model as a pickle at this path.
"""
self._make_tagdict(tagged_sents)
self.model.classes = self.classes
model = self.model
model.rescope(self.features, self.classes)

prev, prev2 = self.START
for iter_ in range(self.n_iter):
c = 0
n = 0
for tagged_sent in tagged_sents:
prev2, prev = self.START
context = self.START + [w for w, _ in tagged_sent] + self.END
for i, (word, tag) in enumerate(tagged_sent):
try:
guess = self.tagdict[word]
except KeyError:
feats = self._get_features(i, word, context, prev, prev2)
guess = self.model.predict(feats)
self.model.update(tag, guess, feats)
prev2 = prev
prev = guess
guess = model.predict(feats)
model.update(tag, guess, feats)
prev2, prev = prev, guess
c += guess == tag
n += 1
random.shuffle(tagged_sents)
logging.info("Iter %d: %d / %d = %f", iter_, c, n, c / n)
self.model.average_weights()
model.average_weights()

if save is not None:
pickle.dump(
(self.model.weights, self.tagdict, self.classes),
(self.tagdict, model._weights, model.classes, model.features),
open(save, "wb"),
protocol=_PICKLE_PROTOCOL,
)
@@ -212,7 +243,7 @@ def load(self, path):
The path where the pickled model is located.
"""
try:
w_td_c = pickle.load(open(path, "rb")) # nosec
data = pickle.load(open(path, "rb")) # nosec
except IOError:
raise FileNotFoundError(f"Can't locate tagger model {path}")
except: # noqa
@@ -226,10 +257,15 @@
"In the latter case, please install Git LFS "
"(https://git-lfs.github.com/) and re-install pycantonese."
)
self.model.weights, self.tagdict, self.classes = w_td_c
self.model.classes = self.classes

def _get_features(self, i, word, context, prev, prev2):
self.tagdict, weights, classes, features = data
self.classes = set(classes)
self.features = set(features)
self.model.rescope(features, classes)
self.model._weights = weights
self.model._totals = None
self.model._tstamps = None
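
save and load are mirror images: train pickles a 4-tuple, and load unpacks the same one. A self-contained round-trip sketch with toy stand-in values (assumed; the real payload carries the trained weights matrix):

import pickle

# Toy stand-ins for (tagdict, weights, classes, features).
payload = ({"食": "V"}, [[0.0]], ["V"], ["bias"])
blob = pickle.dumps(payload, protocol=4)
tagdict, weights, classes, features = pickle.loads(blob)
assert classes == ["V"] and features == ["bias"]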

def _get_features(self, i, word, context: List[str], prev, prev2):
"""Map tokens into a feature representation, implemented as a
{hashable: float} dict. If the features change, a new model must be
trained.
@@ -243,24 +279,25 @@ def add(name, *args):

# It's useful to have a constant feature,
# which acts sort of like a prior.
add("bias")
add(_F_BIAS)

add("i word's first char", word[0])
add("i word's final char", word[-1])
add(_F_CUR_WORD_FIRST_CHAR, word[0])
add(_F_CUR_WORD_FINAL_CHAR, word[-1])

add("i-1 word's first char", context[i - 1][0])
add("i-1 word's final char", context[i - 1][-1])
add("i-1 tag", prev)
add(_F_PREV_WORD_FIRST_CHAR, context[i - 1][0])
add(_F_PREV_WORD_FINAL_CHAR, context[i - 1][-1])
add(_F_PREV_TAG, prev)

add("i-2 word's first char", context[i - 2][0])
add("i-2 word's final char", context[i - 2][-1])
add("i-2 tag", prev2)
add(_F_PREV2_WORD_FIRST_CHAR, context[i - 2][0])
add(_F_PREV2_WORD_FINAL_CHAR, context[i - 2][-1])
add(_F_PREV2_TAG, prev2)

add("i+1 word's first char", context[i + 1][0])
add("i+1 word's final char", context[i + 1][-1])
add(_F_NEXT_WORD_FIRST_CHAR, context[i + 1][0])
add(_F_NEXT_WORD_FINAL_CHAR, context[i + 1][-1])

add("i+2 word's first char", context[i + 2][0])
add("i+2 word's final char", context[i + 2][-1])
# The previous implementation had a copy-paste error here.
add(_F_NEXT2_WORD_FIRST_CHAR, context[i + 2][0])
add(_F_NEXT2_WORD_FINAL_CHAR, context[i + 2][-1])

return features
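
The body of add is folded out of this diff; presumably, as in the classic averaged-perceptron tagger this design follows, it space-joins the template name with its arguments into a single hashable key, the same format the f"{prefix} {value}" strings in _make_tagdict below must match. A sketch under that assumption:

import collections

def get_features_sketch(word, prev_tag):
    features = collections.defaultdict(float)

    def add(name, *args):
        # Assumed behavior: space-join the template name and its arguments.
        features[" ".join((name,) + args)] += 1.0

    add("bias")
    add("i word's first char", word[0])
    add("i-1 tag", prev_tag)
    return dict(features)

print(get_features_sketch("食飯", "V"))
# {'bias': 1.0, "i word's first char 食": 1.0, 'i-1 tag V': 1.0}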

@@ -280,8 +317,29 @@ def _make_tagdict(self, tagged_sents):
unambiguous = (mode / n) >= self.ambiguity_threshold
if above_freq_threshold and unambiguous:
self.tagdict[word] = tag

self.features.add(_F_BIAS)
for word in words | set(self.START) | set(self.END):
self.features.add(f"{_F_CUR_WORD_FIRST_CHAR} {word[0]}")
self.features.add(f"{_F_CUR_WORD_FINAL_CHAR} {word[-1]}")
self.features.add(f"{_F_PREV_WORD_FIRST_CHAR} {word[0]}")
self.features.add(f"{_F_PREV_WORD_FINAL_CHAR} {word[-1]}")
self.features.add(f"{_F_PREV2_WORD_FIRST_CHAR} {word[0]}")
self.features.add(f"{_F_PREV2_WORD_FINAL_CHAR} {word[-1]}")
self.features.add(f"{_F_NEXT_WORD_FIRST_CHAR} {word[0]}")
self.features.add(f"{_F_NEXT_WORD_FINAL_CHAR} {word[-1]}")
self.features.add(f"{_F_NEXT2_WORD_FIRST_CHAR} {word[0]}")
self.features.add(f"{_F_NEXT2_WORD_FINAL_CHAR} {word[-1]}")
for tag in self.classes:
for prefix in (_F_PREV2_TAG, _F_PREV_TAG):
self.features.add(f"{prefix} {tag}")
self.features.add(f"{_F_PREV2_TAG} {self.START[0]}")
self.features.add(f"{_F_PREV_TAG} {self.START[1]}")
self.features.add(f"{_F_PREV2_TAG} {self.START[1]}")

logging.info("%d unique words in the training data", len(words))
logging.info("%d tags in this tagset", len(self.classes))
logging.info("%d features populated for the training data", len(self.features))
logging.info("%d words are treated as having a unique tag", len(self.tagdict))


4 changes: 3 additions & 1 deletion src/pycantonese/pos_tagging/train_tagger.py
@@ -1,6 +1,7 @@
"""This script trains a part-of-speech tagger."""

import logging
import random

from pycantonese import hkcancor
from pycantonese.pos_tagging import POSTagger
@@ -10,7 +11,7 @@
_TAGGER_PARAMETERS = {
"frequency_threshold": 10,
"ambiguity_threshold": 0.9,
"n_iter": 10,
"n_iter": 50,
}

# Several POS tags in HKCanCor are odd ones for proper nouns.
@@ -42,4 +43,5 @@ def _get_tagged_sents():
if __name__ == "__main__":
logging.basicConfig(level="INFO")
tagger = POSTagger(**_TAGGER_PARAMETERS)
random.seed(123456)
tagger.train(_get_tagged_sents(), save=_PICKLE_PATH)
4 changes: 2 additions & 2 deletions tests/test_parsing.py
@@ -27,11 +27,11 @@
),
(
# Unseen "word", so no jyutping in the output
"135",
"136",
None,
None,
None,
"*X: 135\n%mor: X|\n",
"*X: 136\n%mor: X|\n",
),
(
# Custom POS tagging