fix: update for SpaCy 2.2.x

howl-anderson · Mar 12, 2020 · 03d1eea · 03d1eea
1 parent 3db4160
commit 03d1eea
Show file tree

Hide file tree

Showing 22 changed files with 514 additions and 74 deletions.
diff --git a/.gitmodules b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "spacy-dev-resources"]
 	path = spacy-dev-resources
-	url = git@github.com:howl-anderson/spacy-dev-resources.git
+	url = https://github.com/howl-anderson/spacy-dev-resources.git
 [submodule "third-part/brown-cluster"]
 	path = third-part/brown-cluster
-	url = https://github.com/percyliang/brown-cluster.git
+	url = https://github.com/howl-anderson/brown-cluster.git
diff --git a/all_in_one.bash b/all_in_one.bash
@@ -0,0 +1,17 @@
+#!/bin/bash
+#
+# Run the whole model-building pipeline, one step after another.
+# Every step consumes files produced by an earlier one, so stop at the first
+# failure instead of continuing with missing or stale inputs.
+set -e
+
+# NOTE(review): create_init_model.bash reads ./spacy_corpus.jsonl, which is
+# written by create_jsonl_corpus.bash; that script is not invoked below.
+# Confirm whether it should run before create_init_model.bash.
+
+./create_wikipedia_corpus.bash
+./move_wikipedia_corpus.bash
+./compute_words_freq.bash
+./merge_all_text_files.bash
+./download_and_compile_brown_cluster.bash
+./compute_plain_word_vec.bash
+./create_init_model.bash
+./update_model_meta.py
+./download_UD_Chinese-GSD_corpus.bash
+./extract_UD_Chinese-GSD_corpus.bash
+./convert_UD_Chinese-GSD_corpus.bash
+./format_convertor.bash
+./init_model.bash
+./train_model.bash
+./onto_to_spacy_json.bash
+./train_ner.bash
+./merge_submodel.py
diff --git a/compute_plain_word_vec.bash b/compute_plain_word_vec.bash
@@ -3,4 +3,4 @@
 cpu_count=`nproc --all`
 process_count=$(expr $cpu_count - 1)
 
-python ./spacy-dev-resources/training/plain_word_vectors.py -i 200 -n ${process_count} ../chinese-wikipedia-corpus-creator/token_cleaned_plain_files WORDS_VECS.txt
+python ./spacy-dev-resources/training/plain_word_vectors.py -i 200 -n ${process_count} ./WORDS.txt WORDS_VECS.txt
diff --git a/create_init_model.bash b/create_init_model.bash
@@ -1,3 +1,3 @@
 #!/bin/bash
 
-python -m spacy init-model -c ./WORDS-c1000-p1.out/paths -v WORDS_VECS.txt zh zh_wiki_core WORDS_FREQ.txt
+python -m spacy init-model zh spacy_models/base_model --jsonl-loc ./spacy_corpus.jsonl --vectors-loc WORDS_VECS.txt --vectors-name zh_core_web_sm.vectors
diff --git a/create_jsonl_corpus.bash b/create_jsonl_corpus.bash
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+# Build the JSONL vocabulary file for the model.
+# Positional arguments, in the order create_jsonl_vocabulary.py declares them:
+# language code, output JSONL path, word-frequency file, Brown cluster paths file.
+python ./create_jsonl_vocabulary.py zh spacy_corpus.jsonl WORDS_FREQ.txt ./WORDS-c1000-p1.out/paths
diff --git a/create_jsonl_vocabulary.py b/create_jsonl_vocabulary.py
@@ -0,0 +1,269 @@
+import json
+import math
+import string
+from ast import literal_eval
+from pathlib import Path
+
+import ftfy
+import jsonlines
+import plac
+import validators
+from preshed.counter import PreshCounter
+from spacy.lang.en import stop_words as en_stop_words
+from spacy.lang.zh import stop_words as zh_stop_words
+from tqdm import tqdm
+
+
+class Word:
+    """Lexical attributes of one vocabulary entry, exposed as properties.
+
+    Each property corresponds to one key of the JSONL row written for the
+    word (``orth``, ``id``, ``lower``, ..., ``is_right_punct``).
+
+    Args:
+        word_str: The surface form of the word.
+        cluster: The cluster identifier associated with the word.
+        probs: Mapping from word string to its log probability.
+    """
+
+    # Last id handed out, shared by all instances; -1 so the first id is 0.
+    counter = -1
+
+    # The lookup tables below are identical for every word, so they are built
+    # once on the class instead of once per instance.
+    _CHINESE_PUNCT = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
+    _PUNCT = frozenset(string.punctuation + _CHINESE_PUNCT)
+
+    _CHINESE_WHITESPACE = ""
+    _WHITESPACE = frozenset(string.whitespace + _CHINESE_WHITESPACE)
+
+    _CHINESE_QUOTE = "“”‘’"
+    _ENGLISH_QUOTE = "\"'"
+    _QUOTES = frozenset(_ENGLISH_QUOTE + _CHINESE_QUOTE)
+
+    _ASCII_LEFT_PUNCT = "<([{"
+    _FULLWIDTH_LEFT_PUNCT = "＜（［「『【〔〖〘〚｛"
+    _LEFT_PUNCT = frozenset(_FULLWIDTH_LEFT_PUNCT + _ASCII_LEFT_PUNCT)
+
+    _ASCII_RIGHT_PUNCT = ">)]}"
+    _FULLWIDTH_RIGHT_PUNCT = "＞）］」』】〕〗〙〛｝"
+    _RIGHT_PUNCT = frozenset(_FULLWIDTH_RIGHT_PUNCT + _ASCII_RIGHT_PUNCT)
+
+    # Union of the English and Chinese stop words; filled in on first use.
+    _stopwords = None
+
+    def __init__(self, word_str, cluster, probs):
+        self._word = word_str
+        self._cluster = cluster
+        self._probs = probs
+        # Assigned lazily by the ``id`` property.
+        self._id = None
+
+    @classmethod
+    def _stopword_set(cls):
+        """Return the combined English/Chinese stop-word set, built once."""
+        if cls._stopwords is None:
+            cls._stopwords = frozenset(en_stop_words.STOP_WORDS) | frozenset(
+                zh_stop_words.STOP_WORDS
+            )
+        return cls._stopwords
+
+    @property
+    def orth(self):
+        """The word text, unchanged."""
+        return self._word
+
+    @property
+    def id(self):
+        """Sequential id of this word, allocated on first access.
+
+        The class-wide counter advances only the first time the property is
+        read for an instance; later reads return the same value.
+        """
+        if self._id is None:
+            self.__class__.counter += 1
+            self._id = self.__class__.counter
+
+        return self._id
+
+    @property
+    def lower(self):
+        """The word text lower-cased."""
+        return self._word.lower()
+
+    @property
+    def norm(self):
+        """The normalised form; currently the word text itself."""
+        return self._word
+
+    @property
+    def shape(self):
+        """One ``X`` per upper-case character and one ``x`` for any other."""
+        return "".join("X" if char.isupper() else "x" for char in self._word)
+
+    @property
+    def prefix(self):
+        """First character of the word (the word must be non-empty)."""
+        return self._word[0]
+
+    @property
+    def suffix(self):
+        """Last character of the word (the word must be non-empty)."""
+        return self._word[-1]
+
+    @property
+    def length(self):
+        """Number of characters in the word."""
+        return len(self._word)
+
+    @property
+    def cluster(self):
+        """The cluster identifier given at construction."""
+        return self._cluster
+
+    @property
+    def prob(self):
+        """Log probability of the word, or 0 when it is not in ``probs``."""
+        # ``probs`` is keyed by word string, so look up the text, not ``self``.
+        return self._probs.get(self._word, 0)
+
+    @property
+    def is_alpha(self):
+        """Whether ``str.isalpha`` holds for the word."""
+        return self._word.isalpha()
+
+    @property
+    def is_ascii(self):
+        """Whether the word can be encoded as ASCII."""
+        # str.isascii() only exists from Python 3.7 on, so try encoding instead.
+        try:
+            self._word.encode("ascii")
+        except UnicodeEncodeError:
+            return False
+
+        return True
+
+    @property
+    def is_digit(self):
+        """Whether ``str.isdigit`` holds for the word."""
+        return self._word.isdigit()
+
+    @property
+    def is_lower(self):
+        """Whether ``str.islower`` holds for the word."""
+        return self._word.islower()
+
+    @property
+    def is_punct(self):
+        """Whether the word is a single ASCII or Chinese punctuation mark."""
+        return self._word in self._PUNCT
+
+    @property
+    def is_space(self):
+        """Whether the word is a single whitespace character."""
+        return self._word in self._WHITESPACE
+
+    @property
+    def is_title(self):
+        """Whether ``str.istitle`` holds for the word."""
+        return self._word.istitle()
+
+    @property
+    def is_upper(self):
+        """Whether ``str.isupper`` holds for the word."""
+        return self._word.isupper()
+
+    @property
+    def like_url(self):
+        """Whether ``validators.url`` accepts the word."""
+        return bool(validators.url(self._word))
+
+    @property
+    def like_num(self):
+        """Always ``False``; number detection is not implemented yet."""
+        # TODO(howl-anderson): fix it later
+        return False
+
+    @property
+    def like_email(self):
+        """Whether ``validators.email`` accepts the word."""
+        return bool(validators.email(self._word))
+
+    @property
+    def is_stop(self):
+        """Whether the word is an English or Chinese stop word."""
+        return self._word in self._stopword_set()
+
+    @property
+    def is_oov(self):
+        """Whether the word is missing from the ``probs`` mapping."""
+        return self._word not in self._probs
+
+    @property
+    def is_quote(self):
+        """Whether the word is a single ASCII or Chinese quotation mark."""
+        return self._word in self._QUOTES
+
+    @property
+    def is_left_punct(self):
+        """Whether the word is a single opening bracket (ASCII or full-width)."""
+        return self._word in self._LEFT_PUNCT
+
+    @property
+    def is_right_punct(self):
+        """Whether the word is a single closing bracket (ASCII or full-width)."""
+        return self._word in self._RIGHT_PUNCT
+
+
+def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
+    """Read a word-frequency file and turn it into smoothed log probabilities.
+
+    Each line of the file is ``freq<TAB>doc_freq<TAB>key``, where ``key`` is a
+    Python string literal that is decoded with ``ast.literal_eval``. The file
+    is read twice: once to accumulate the counts and the total, once to
+    compute a probability for every word that passes the thresholds.
+
+    Args:
+        freqs_loc: ``pathlib.Path`` of the frequency file (read as UTF-8).
+        max_length: Entries whose raw ``key`` field is this long or longer
+            are dropped.
+        min_doc_freq: Minimum document frequency for a word to be kept.
+        min_freq: Minimum total frequency for a word to be kept.
+
+    Returns:
+        A ``(probs, oov_prob)`` tuple: ``probs`` maps each kept word to its
+        log probability, ``oov_prob`` is the log probability given to a
+        count of zero.
+
+    Raises:
+        ValueError: If a line is malformed, or the frequencies sum to zero
+            (``math.log`` of the total is undefined).
+    """
+    print("Counting frequencies...")
+    counts = PreshCounter()
+    total = 0
+    # The corpus is Chinese text, so decode explicitly as UTF-8 rather than
+    # relying on the platform's locale default.
+    with freqs_loc.open(encoding="utf-8") as f:
+        for i, line in enumerate(f):
+            freq, doc_freq, key = line.rstrip().split("\t", 2)
+            freq = int(freq)
+            # Counts are keyed by 1-based line number, not by the word itself.
+            counts.inc(i + 1, freq)
+            total += freq
+    counts.smooth()
+    log_total = math.log(total)
+    probs = {}
+    with freqs_loc.open(encoding="utf-8") as f:
+        for line in tqdm(f):
+            freq, doc_freq, key = line.rstrip().split("\t", 2)
+            doc_freq = int(doc_freq)
+            freq = int(freq)
+            if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
+                word = literal_eval(key)
+                smooth_count = counts.smoother(freq)
+                probs[word] = math.log(smooth_count) - log_total
+    oov_prob = math.log(counts.smoother(0)) - log_total
+    return probs, oov_prob
+
+
+def read_clusters(clusters_loc):
+    """Read a Brown-cluster paths file into a ``word -> cluster`` mapping.
+
+    Each usable line holds three whitespace-separated fields: cluster, word
+    and frequency. Lines that do not split into exactly three fields are
+    skipped. Words are passed through ``ftfy.fix_text`` before being stored.
+
+    Args:
+        clusters_loc: ``pathlib.Path`` of the cluster file (read as UTF-8).
+
+    Returns:
+        A dict mapping each word, plus its lower-, title- and upper-cased
+        variants where not already present, to a cluster string. Words seen
+        fewer than three times are mapped to ``"0"``.
+    """
+    print("Reading clusters...")
+    clusters = {}
+    # The words are Chinese text, so decode explicitly as UTF-8 rather than
+    # relying on the platform's locale default.
+    with clusters_loc.open(encoding="utf-8") as f:
+        for line in tqdm(f):
+            try:
+                cluster, word, freq = line.split()
+                word = ftfy.fix_text(word)
+            except ValueError:
+                continue
+            # If the clusterer has only seen the word a few times, its
+            # cluster is unreliable.
+            if int(freq) >= 3:
+                clusters[word] = cluster
+            else:
+                clusters[word] = "0"
+    # Expand clusters with re-casing; iterate over a snapshot because the
+    # dict grows while the variants are added.
+    for word, cluster in list(clusters.items()):
+        for variant in (word.lower(), word.title(), word.upper()):
+            clusters.setdefault(variant, cluster)
+    return clusters
+
+
+@plac.annotations(
+    lang=("model language", "positional", None, str),
+    output_loc=("model output directory", "positional", None, str),
+    freqs_loc=("location of words frequencies file", "positional", None, Path),
+    clusters_loc=("location of brown clusters data", "positional", None, Path),
+)
+def main(lang, output_loc, freqs_loc, clusters_loc):
+    """Write the JSONL vocabulary file: one header row, then one row per word.
+
+    The header carries the language and the out-of-vocabulary probability.
+    Every non-empty word from the cluster file then gets a row holding the
+    attributes computed by ``Word``. ``output_loc`` is opened with
+    ``jsonlines.open`` in write mode, i.e. it is the path of the output file.
+    """
+    # Row keys, in output order; each one is the name of a ``Word`` property.
+    row_fields = (
+        "orth",  # the word text
+        "id",  # can correspond to row in vectors table
+        "lower",
+        "norm",
+        "shape",
+        "prefix",
+        "suffix",
+        "length",
+        "cluster",
+        "prob",
+        "is_alpha",
+        "is_ascii",
+        "is_digit",
+        "is_lower",
+        "is_punct",
+        "is_space",
+        "is_title",
+        "is_upper",
+        "like_url",
+        "like_num",
+        "like_email",
+        "is_stop",
+        "is_oov",
+        "is_quote",
+        "is_left_punct",
+        "is_right_punct",
+    )
+
+    word_clusters = read_clusters(clusters_loc)
+    probs, oov_prob = read_freqs(freqs_loc)
+
+    with jsonlines.open(output_loc, mode="w") as writer:
+        writer.write({"lang": lang, "settings": {"oov_prob": oov_prob}})
+
+        for text, cluster_id in word_clusters.items():
+            # Empty strings have no first/last character to report.
+            if text:
+                entry = Word(text, cluster_id, probs)
+                writer.write({field: getattr(entry, field) for field in row_fields})
+
+# When run as a script, let plac map the command-line arguments onto main().
+if __name__ == "__main__":
+    plac.call(main)
diff --git a/create_model_package.bash b/create_model_package.bash
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# Package the merged model, then build a source distribution from it.
+python -m spacy package spacy_models/final_model spacy_models/model_package --force
+
+# Abort if the package directory is missing, so setup.py is never run from
+# the wrong working directory.
+cd spacy_models/model_package/zh_core_web_sm-0.1.0 || exit 1
+python ./setup.py sdist
+
diff --git a/create_wikipedia_corpus.bash b/create_wikipedia_corpus.bash
@@ -0,0 +1,2 @@
+#!/bin/bash
+
+# Build the Wikipedia corpus inside its own checkout; abort if the directory
+# is missing instead of looking for the script in the current directory.
+cd chinese-wikipedia-corpus-creator || exit 1
+bash ./allinone_process.bash
diff --git a/merge_all_text_files.py b/merge_all_text_files.py
@@ -9,10 +9,10 @@
 output_path = pathlib.Path(output_file)
 
 
-with output_path.open('wt') as outfile:
+with output_path.open("wt") as outfile:
     for fname in input_files:
-        with fname.open('rt') as infile:
+        with fname.open("rt") as infile:
             for line in infile:
-                if not line.endswith('\n'):
-                    line = line + '\n'
+                if not line.endswith("\n"):
+                    line = line + "\n"
                 outfile.write(line)
diff --git a/merge_submodel.py b/merge_submodel.py
@@ -0,0 +1,51 @@
+#!/usr/bin/python3
+
+import shutil
+import json
+from pathlib import Path
+
+
+def read_pipeline(meta_file):
+    """Return the ``"pipeline"`` entry of the JSON document at *meta_file*."""
+    with open(meta_file) as handle:
+        meta = json.load(handle)
+    return meta["pipeline"]
+
+
+def update_pipeline(meta_file, pipeline):
+    """Rewrite *meta_file* with its ``"pipeline"`` entry set to *pipeline*.
+
+    All other keys of the JSON document are kept as they were read.
+    """
+    with open(meta_file) as source:
+        meta = json.load(source)
+
+    meta["pipeline"] = pipeline
+    serialized = json.dumps(meta)
+
+    with open(meta_file, "w") as sink:
+        sink.write(serialized)
+
+
+def copy_tree(src: Path, dst: Path, folder: str):
+    """Copy the sub-directory *folder* of *src* to the same name under *dst*."""
+    origin = src / folder
+    destination = dst / folder
+    shutil.copytree(origin, destination)
+
+
+def main():
+    """Assemble ``final_model`` from the dependency model and the NER model.
+
+    The parser, tagger and vocab folders come from the dependency model; the
+    ner folder and ``meta.json`` come from the NER model. The copied
+    ``meta.json`` then has its pipeline replaced by the dependency model's
+    pipeline followed by the NER model's pipeline.
+    """
+    final_dir = Path("./spacy_models/final_model")
+    final_dir.mkdir(exist_ok=True)
+
+    dependency_dir = Path("./spacy_models/dependency_model/model-best")
+    for component in ("parser", "tagger", "vocab"):
+        copy_tree(dependency_dir, final_dir, component)
+    merged_pipeline = list(read_pipeline(dependency_dir / "meta.json"))
+
+    ner_dir = Path("./spacy_models/ner_model/model-best")
+    copy_tree(ner_dir, final_dir, "ner")
+    final_meta = final_dir / "meta.json"
+    shutil.copy(ner_dir / "meta.json", final_meta)
+    merged_pipeline += read_pipeline(ner_dir / "meta.json")
+
+    update_pipeline(final_meta, merged_pipeline)
+
+
+# Run the merge only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()