Skip to content

Commit

Permalink
bugfix in Onto to SpaCy script
Browse files Browse the repository at this point in the history
  • Loading branch information
howl-anderson committed Aug 5, 2018
1 parent 1e12339 commit 4b10e43
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 5 deletions.
14 changes: 10 additions & 4 deletions onto_to_spacy_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,17 +48,23 @@ def raw_text(text):


def ent_position(ents, text):
search_point = 0
spacy_ents = []
for ent in ents:
ma = re.search(ent[0], text)
ent_tup = (ma.start(), ma.end(), ent[1])
remain_text = text[search_point:]
ma = re.search(ent[0], remain_text)
ent_tup = (ma.start() + search_point, ma.end() + search_point, ent[1])
spacy_ents.append(ent_tup)

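# Advance the search point past this match so that a word which also
# appears in a later entity is not matched again at the earlier position;
# the resulting wrong offsets would be a bug that is hard to debug.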
search_point = search_point + ma.end()
return spacy_ents


def text_to_spacy(markup):
ents = re.findall("<ENAMEX(.+?)</ENAMEX>", markup)
ents = [clean_ent(ent) for ent in ents]
raw_ents = re.findall("<ENAMEX(.+?)</ENAMEX>", markup)
ents = [clean_ent(raw_ent) for raw_ent in raw_ents]
text = raw_text(markup)
spacy_ents = ent_position(ents, text)
final = (text, {"entities": spacy_ents})
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
plac
spacy
pandas
jieba
2 changes: 1 addition & 1 deletion train_ner.bash
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#!/bin/bash

python -m spacy train zh ner_model china_ner_train.json china_ner_eval.json --no-tagger --no-parser --vectors ./zh_model
python -m spacy train zh ner_model china_ner_train.json china_ner_eval.json --no-tagger --no-parser -d -g 0 --vectors ./zh_model

0 comments on commit 4b10e43

Please sign in to comment.