-
Notifications
You must be signed in to change notification settings - Fork 110
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
3db4160
commit 03d1eea
Showing
22 changed files
with
514 additions
and
74 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
[submodule "spacy-dev-resources"] | ||
path = spacy-dev-resources | ||
url = git@github.com:howl-anderson/spacy-dev-resources.git | ||
url = https://github.com/howl-anderson/spacy-dev-resources.git | ||
[submodule "third-part/brown-cluster"] | ||
path = third-part/brown-cluster | ||
url = https://github.com/percyliang/brown-cluster.git | ||
url = https://github.com/howl-anderson/brown-cluster.git |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
./create_wikipedia_corpus.bash | ||
./move_wikipedia_corpus.bash | ||
./compute_words_freq.bash | ||
./merge_all_text_files.bash | ||
./download_and_compile_brown_cluster.bash | ||
./compute_plain_word_vec.bash | ||
./create_init_model.bash | ||
./update_model_meta.py | ||
./download_UD_Chinese-GSD_corpus.bash | ||
./extract_UD_Chinese-GSD_corpus.bash | ||
./convert_UD_Chinese-GSD_corpus.bash | ||
./format_convertor.bash | ||
./init_model.bash | ||
./train_model.bash | ||
./onto_to_spacy_json.bash | ||
./train_ner.bash | ||
./merge_submodel.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
#!/bin/bash | ||
|
||
python -m spacy init-model -c ./WORDS-c1000-p1.out/paths -v WORDS_VECS.txt zh zh_wiki_core WORDS_FREQ.txt | ||
python -m spacy init-model zh spacy_models/base_model --jsonl-loc ./spacy_corpus.jsonl --vectors-loc WORDS_VECS.txt --vectors-name zh_core_web_sm.vectors |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
#!/bin/bash | ||
|
||
python ./create_jsonl_vocabulary.py zh spacy_corpus.jsonl WORDS_FREQ.txt ./WORDS-c1000-p1.out/paths |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,269 @@ | ||
import json | ||
import math | ||
import string | ||
from ast import literal_eval | ||
from pathlib import Path | ||
|
||
import ftfy | ||
import jsonlines | ||
import plac | ||
import validators | ||
from preshed.counter import PreshCounter | ||
from spacy.lang.en import stop_words as en_stop_words | ||
from spacy.lang.zh import stop_words as zh_stop_words | ||
from tqdm import tqdm | ||
|
||
|
||
class Word:
    """Lexical attributes of a single vocabulary entry.

    Every property maps to one field of the vocabulary row that the
    script serialises for the word.

    Args:
        word_str: Surface form of the word.
        cluster: Cluster identifier recorded for the word.
        probs: Mapping from word string to log-probability. It is also used
            to decide whether the word is out-of-vocabulary.
    """

    # Highest id handed out so far; -1 so that the first allocated id is 0.
    counter = -1

    # The character tables below are class-level frozensets: they are built
    # once instead of once per word, and membership tests are O(1).
    #
    # NOTE(review): in the source as rendered, the punctuation literal was
    # terminated early by an ASCII double quote and the bracket tables held
    # ASCII look-alikes, presumably because fullwidth characters were
    # normalised somewhere upstream. The fullwidth forms are restored here as
    # a superset of what was visible -- confirm against the intended list.
    _punct_list = frozenset(
        string.punctuation
        # Fullwidth variants of the ASCII punctuation (U+FF01..U+FF5E) sit at
        # a fixed offset of 0xFEE0 from their ASCII counterparts.
        + "".join(chr(ord(ch) + 0xFEE0) for ch in string.punctuation)
        # Fullwidth white parentheses and halfwidth CJK punctuation.
        + "\uff5f\uff60\uff61\uff62\uff63\uff64"
        + "。⦅⦆「」、〃》『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏"
    )

    # U+3000 is the ideographic space used in Chinese text.
    # NOTE(review): the Chinese whitespace literal was empty in the source as
    # rendered; U+3000 is assumed to be what was intended -- confirm.
    _whitespace_list = frozenset(string.whitespace + "\u3000")

    _quote_list = frozenset("\"'" + "“”‘’")

    # ASCII, fullwidth and CJK opening / closing brackets.
    _left_punct_list = frozenset(
        "<([{" + "\uff1c\uff08\uff3b\uff5b" + "「『【〔〖〘〚"
    )
    _right_punct_list = frozenset(
        ">)]}" + "\uff1e\uff09\uff3d\uff5d" + "」』】〕〗〙〛"
    )

    # Union of the English and Chinese stop words; filled in by the first
    # instantiation so that defining the class does not touch spaCy.
    _stopword_list = None

    def __init__(self, word_str, cluster, probs):
        self._word = word_str
        self._cluster = cluster
        self._probs = probs

        if Word._stopword_list is None:
            Word._stopword_list = frozenset(en_stop_words.STOP_WORDS) | frozenset(
                zh_stop_words.STOP_WORDS
            )

    @property
    def orth(self):
        """The word text, unchanged."""
        return self._word

    @property
    def id(self):
        """Sequential id of this word, starting at 0.

        The id is allocated from the class-wide counter on first access and
        then cached, so reading the property repeatedly neither changes the
        word's id nor consumes further ids.
        """
        assigned = self.__dict__.get("_id")
        if assigned is None:
            self.__class__.counter += 1
            assigned = self._id = self.__class__.counter
        return assigned

    @property
    def lower(self):
        """Lower-cased word text."""
        return self._word.lower()

    @property
    def norm(self):
        """Normalised form; currently identical to the word text."""
        return self._word

    @property
    def shape(self):
        """Case pattern: ``X`` per upper-case character, ``x`` otherwise."""
        return "".join("X" if ch.isupper() else "x" for ch in self._word)

    @property
    def prefix(self):
        """First character of the word (empty string for an empty word)."""
        return self._word[:1]

    @property
    def suffix(self):
        """Last character of the word (empty string for an empty word)."""
        return self._word[-1:]

    @property
    def length(self):
        """Number of characters in the word."""
        return len(self._word)

    @property
    def cluster(self):
        """Cluster identifier passed to the constructor."""
        return self._cluster

    @property
    def prob(self):
        """Log-probability of the word, or 0 when it has no entry."""
        # The table is keyed by word string, so look up the text rather than
        # this object (which would never match and always yield the default).
        return self._probs.get(self._word, 0)

    @property
    def is_alpha(self):
        return self._word.isalpha()

    @property
    def is_ascii(self):
        # str.isascii() only exists from Python 3.7 on, so probe by encoding.
        try:
            self._word.encode("ascii")
        except UnicodeEncodeError:
            return False
        return True

    @property
    def is_digit(self):
        return self._word.isdigit()

    @property
    def is_lower(self):
        return self._word.islower()

    @property
    def is_punct(self):
        """Whether the whole word is one known punctuation character."""
        return self._word in self._punct_list

    @property
    def is_space(self):
        """Whether the whole word is one known whitespace character."""
        return self._word in self._whitespace_list

    @property
    def is_title(self):
        return self._word.istitle()

    @property
    def is_upper(self):
        return self._word.isupper()

    @property
    def like_url(self):
        return bool(validators.url(self._word))

    @property
    def like_num(self):
        # TODO(howl-anderson): fix it later
        return False

    @property
    def like_email(self):
        return bool(validators.email(self._word))

    @property
    def is_stop(self):
        """Whether the word is an English or Chinese stop word."""
        return self._word in self._stopword_list

    @property
    def is_oov(self):
        """Whether the word has no entry in the probability table."""
        return self._word not in self._probs

    @property
    def is_quote(self):
        return self._word in self._quote_list

    @property
    def is_left_punct(self):
        return self._word in self._left_punct_list

    @property
    def is_right_punct(self):
        return self._word in self._right_punct_list
|
||
|
||
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
    """Read a word-frequency table and turn it into smoothed log-probabilities.

    Each line must hold ``freq<TAB>doc_freq<TAB>key``, where ``key`` is a
    Python literal that is decoded with ``ast.literal_eval``.

    Args:
        freqs_loc: ``pathlib.Path`` of the frequency file.
        max_length: Entries whose raw key is at least this long are dropped.
        min_doc_freq: Minimum document frequency for an entry to be kept.
        min_freq: Minimum frequency for an entry to be kept.

    Returns:
        A ``(probs, oov_prob)`` tuple: ``probs`` maps each kept word to its
        smoothed log-probability and ``oov_prob`` is the log-probability
        given to a count of zero.

    Raises:
        ValueError: If a line is malformed or the total frequency is zero.
    """
    print("Counting frequencies...")
    counts = PreshCounter()
    total = 0
    # First pass: feed every frequency to the counter and sum the total mass.
    # The file holds non-ASCII words, so do not rely on the locale encoding.
    # NOTE(review): assumes the file was written as UTF-8 -- confirm.
    with freqs_loc.open(encoding="utf8") as f:
        for i, line in enumerate(f):
            freq, _, _ = line.rstrip().split("\t", 2)
            freq = int(freq)
            counts.inc(i + 1, freq)
            total += freq
    if total <= 0:
        # math.log(0) would otherwise fail with an opaque "math domain error".
        raise ValueError(
            "no word frequencies found in {}".format(freqs_loc)
        )
    counts.smooth()
    log_total = math.log(total)
    probs = {}
    # Second pass: keep only entries that pass the thresholds.
    with freqs_loc.open(encoding="utf8") as f:
        for line in tqdm(f):
            freq, doc_freq, key = line.rstrip().split("\t", 2)
            doc_freq = int(doc_freq)
            freq = int(freq)
            if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
                word = literal_eval(key)
                smooth_count = counts.smoother(freq)
                probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob
|
||
|
||
def read_clusters(clusters_loc):
    """Load a word -> cluster mapping from a clusters file.

    Each usable line holds three whitespace-separated fields: cluster, word
    and frequency. Lines with a different number of fields are skipped.

    Args:
        clusters_loc: ``pathlib.Path`` of the clusters file.

    Returns:
        A dict mapping each word (and its lower-, title- and upper-cased
        variants when those are not present themselves) to its cluster.
    """
    print("Reading clusters...")
    clusters = {}
    # The file holds non-ASCII words, so do not rely on the locale encoding.
    # NOTE(review): assumes the file was written as UTF-8 -- confirm.
    with clusters_loc.open(encoding="utf8") as f:
        for line in tqdm(f):
            try:
                cluster, word, freq = line.split()
                word = ftfy.fix_text(word)
            except ValueError:
                continue
            # If the clusterer has only seen the word a few times, its
            # cluster is unreliable.
            if int(freq) >= 3:
                clusters[word] = cluster
            else:
                clusters[word] = "0"
    # Expand clusters with re-casing. Iterate over a snapshot because the
    # dict grows inside the loop.
    for word, cluster in list(clusters.items()):
        for variant in (word.lower(), word.title(), word.upper()):
            if variant not in clusters:
                clusters[variant] = cluster
    return clusters
|
||
|
||
@plac.annotations(
    lang=("model language", "positional", None, str),
    output_loc=("model output directory", "positional", None, str),
    freqs_loc=("location of words frequencies file", "positional", None, Path),
    clusters_loc=("location of brown clusters data", "positional", None, Path),
)
def main(lang, output_loc, freqs_loc, clusters_loc):
    """Write the JSONL vocabulary file for one language.

    The first record carries the language and the out-of-vocabulary
    probability; every following record describes one non-empty word from
    the clusters file.
    """
    clusters = read_clusters(clusters_loc)
    probs, oov_prob = read_freqs(freqs_loc)

    # Row keys in output order; each one is also the name of the Word
    # property supplying its value. "orth" is the word text and "id" can
    # correspond to a row in the vectors table.
    field_names = (
        "orth",
        "id",
        "lower",
        "norm",
        "shape",
        "prefix",
        "suffix",
        "length",
        "cluster",
        "prob",
        "is_alpha",
        "is_ascii",
        "is_digit",
        "is_lower",
        "is_punct",
        "is_space",
        "is_title",
        "is_upper",
        "like_url",
        "like_num",
        "like_email",
        "is_stop",
        "is_oov",
        "is_quote",
        "is_left_punct",
        "is_right_punct",
    )

    with jsonlines.open(output_loc, mode="w") as writer:
        writer.write({"lang": lang, "settings": {"oov_prob": oov_prob}})

        for word_str, cluster in clusters.items():
            if word_str:
                entry = Word(word_str, cluster, probs)
                writer.write({name: getattr(entry, name) for name in field_names})
|
||
|
||
if __name__ == "__main__": | ||
plac.call(main) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
#!/bin/bash

# Build the installable package from the merged model; stop if that fails so
# a stale package directory is never turned into a distribution.
python -m spacy package spacy_models/final_model spacy_models/model_package --force || exit 1

# Only build the source distribution from inside the package directory:
# abort if it is missing instead of running setup.py somewhere else.
cd spacy_models/model_package/zh_core_web_sm-0.1.0 || exit 1
python ./setup.py sdist
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Run the corpus pipeline from inside its own checkout; abort if the
# directory is missing rather than looking for the script elsewhere.
cd chinese-wikipedia-corpus-creator || exit 1
bash ./allinone_process.bash
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
#!/usr/bin/python3 | ||
|
||
import shutil | ||
import json | ||
from pathlib import Path | ||
|
||
|
||
def read_pipeline(meta_file):
    """Return the ``pipeline`` entry of the JSON document at *meta_file*."""
    meta = json.loads(Path(meta_file).read_text())
    return meta["pipeline"]
|
||
|
||
def update_pipeline(meta_file, pipeline):
    """Replace the ``pipeline`` entry of the JSON document at *meta_file*.

    The document is loaded, its ``pipeline`` key is set to *pipeline*, and
    the result is written back to the same path.
    """
    meta_path = Path(meta_file)

    with meta_path.open() as source:
        meta = json.load(source)

    meta["pipeline"] = pipeline

    with meta_path.open("w") as sink:
        json.dump(meta, sink)
|
||
|
||
def copy_tree(src: Path, dst: Path, folder: str):
    """Copy the *folder* sub-directory of *src* to the same name under *dst*.

    A destination directory left over from an earlier run is removed first:
    ``shutil.copytree`` refuses to copy onto an existing directory, which
    would otherwise make the script fail on every run after the first.

    Args:
        src: Directory that contains *folder*.
        dst: Directory that receives the copy.
        folder: Name of the sub-directory to copy.
    """
    target = dst / folder
    if target.is_dir():
        shutil.rmtree(str(target))
    shutil.copytree(src / folder, target)
|
||
|
||
def main():
    """Merge the dependency and NER models into one final model directory.

    The parser, tagger and vocab folders come from the dependency model; the
    ner folder and meta.json come from the NER model. The copied meta.json is
    then given the dependency pipeline followed by the NER pipeline.
    """
    final_dir = Path("./spacy_models/final_model")
    final_dir.mkdir(exist_ok=True)

    dependency_dir = Path("./spacy_models/dependency_model/model-best")
    for component in ("parser", "tagger", "vocab"):
        copy_tree(dependency_dir, final_dir, component)
    merged = list(read_pipeline(dependency_dir / "meta.json"))

    ner_dir = Path("./spacy_models/ner_model/model-best")
    copy_tree(ner_dir, final_dir, "ner")
    # The NER model's meta.json is the base of the final one.
    shutil.copy(ner_dir / "meta.json", final_dir / "meta.json")
    merged += read_pipeline(ner_dir / "meta.json")

    update_pipeline(final_dir / "meta.json", merged)
|
||
|
||
if __name__ == "__main__": | ||
main() |
Oops, something went wrong.