diff --git a/Readme.md b/Readme.md index f1e2937..e4ad4a7 100644 --- a/Readme.md +++ b/Readme.md @@ -21,7 +21,7 @@ If you are looking for light-weight versions, VnCoreNLP's word segmentation and ## Installation - `Java 1.8+` (Prerequisite) -- File `VnCoreNLP-1.1.1.jar` (27MB) and folder `models` (115MB) are placed in the same working folder. +- File `VnCoreNLP-1.2.jar` (27MB) and folder `models` (115MB) are placed in the same working folder. - `Python 3.6+` if using [a Python wrapper of VnCoreNLP](https://github.com/thelinhbkhn2014/VnCoreNLP_Wrapper). To install this wrapper, users have to run the following command: `$ pip3 install py_vncorenlp` @@ -38,7 +38,7 @@ import py_vncorenlp # and save them in some local working folder py_vncorenlp.download_model(save_dir='/absolute/path/to/vncorenlp') -# Load VnCoreNLP from the local working folder that contains both `VnCoreNLP-1.1.1.jar` and `models` +# Load VnCoreNLP from the local working folder that contains both `VnCoreNLP-1.2.jar` and `models` model = py_vncorenlp.VnCoreNLP(save_dir='/absolute/path/to/vncorenlp') # Equivalent to: model = py_vncorenlp.VnCoreNLP(annotators=["wseg", "pos", "ner", "parse"], save_dir='/absolute/path/to/vncorenlp') @@ -80,13 +80,13 @@ print(output) You can run VnCoreNLP to annotate an input raw text corpus (e.g. a collection of news content) by using following commands: // To perform word segmentation, POS tagging, NER and then dependency parsing - $ java -Xmx2g -jar VnCoreNLP-1.1.1.jar -fin input.txt -fout output.txt + $ java -Xmx2g -jar VnCoreNLP-1.2.jar -fin input.txt -fout output.txt // To perform word segmentation, POS tagging and then NER - $ java -Xmx2g -jar VnCoreNLP-1.1.1.jar -fin input.txt -fout output.txt -annotators wseg,pos,ner + $ java -Xmx2g -jar VnCoreNLP-1.2.jar -fin input.txt -fout output.txt -annotators wseg,pos,ner // To perform word segmentation and then POS tagging - $ java -Xmx2g -jar VnCoreNLP-1.1.1.jar -fin input.txt -fout output.txt -annotators wseg,pos + $ java -Xmx2g -jar VnCoreNLP-1.2.jar -fin input.txt -fout output.txt -annotators wseg,pos // To perform word segmentation - $ java -Xmx2g -jar VnCoreNLP-1.1.1.jar -fin input.txt -fout output.txt -annotators wseg + $ java -Xmx2g -jar VnCoreNLP-1.2.jar -fin input.txt -fout output.txt -annotators wseg ### Using VnCoreNLP from the API diff --git a/VnCoreNLP-1.2.jar b/VnCoreNLP-1.2.jar new file mode 100644 index 0000000..c3255f2 Binary files /dev/null and b/VnCoreNLP-1.2.jar differ diff --git a/pom.xml b/pom.xml index aaaabbe..0bf5ed6 100755 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ VnCoreNLP VnCoreNLP - 1.1.1 + 1.2 diff --git a/src/main/java/vn/corenlp/ner/NerRecognizer.java b/src/main/java/vn/corenlp/ner/NerRecognizer.java index b32c892..ab74dfa 100644 --- a/src/main/java/vn/corenlp/ner/NerRecognizer.java +++ b/src/main/java/vn/corenlp/ner/NerRecognizer.java @@ -11,6 +11,7 @@ import vn.corenlp.wordsegmenter.Vocabulary; import vn.pipeline.LexicalInitializer; import vn.pipeline.Word; +import vn.pipeline.Utils; import java.io.File; import java.io.IOException; @@ -34,7 +35,7 @@ public NerRecognizer() throws IOException{ nlpDecoder = new NLPDecoder(); List> components = new ArrayList(); - String modelPath = System.getProperty("user.dir") + "/models/ner/vi-ner.xz"; + String modelPath = Utils.jarDir + "/models/ner/vi-ner.xz"; if (!new File(modelPath).exists()) throw new IOException("NerRecognizer: " + modelPath + " is not found!"); GlobalLexica lexica = LexicalInitializer.initialize(true).initializeLexica(); if(lexica != null) { diff --git a/src/main/java/vn/corenlp/parser/DependencyParser.java b/src/main/java/vn/corenlp/parser/DependencyParser.java index b92b38d..488a42a 100755 --- a/src/main/java/vn/corenlp/parser/DependencyParser.java +++ b/src/main/java/vn/corenlp/parser/DependencyParser.java @@ -10,6 +10,7 @@ import org.apache.log4j.Logger; import vn.pipeline.LexicalInitializer; import vn.pipeline.Word; +import vn.pipeline.Utils; import java.io.File; import java.io.IOException; @@ -32,7 +33,7 @@ public DependencyParser() throws IOException { nlpDecoder = new NLPDecoder(); List> components = new ArrayList(); - String modelPath = System.getProperty("user.dir") + "/models/dep/vi-dep.xz"; + String modelPath = Utils.jarDir + "/models/dep/vi-dep.xz"; if (!new File(modelPath).exists()) throw new IOException("DependencyParser: " + modelPath + " is not found!"); GlobalLexica lexica = LexicalInitializer.initialize(true).initializeLexica(); if(lexica != null) { diff --git a/src/main/java/vn/corenlp/postagger/PosTagger.java b/src/main/java/vn/corenlp/postagger/PosTagger.java index f0cafa6..231d3f2 100644 --- a/src/main/java/vn/corenlp/postagger/PosTagger.java +++ b/src/main/java/vn/corenlp/postagger/PosTagger.java @@ -13,13 +13,15 @@ import java.util.LinkedList; import java.util.List; +import vn.pipeline.Utils; + public class PosTagger { private static PosTagger posTagger = null; private MorphTagger tagger; public final static Logger LOGGER = Logger.getLogger(PosTagger.class); public PosTagger() throws IOException { LOGGER.info("Loading POS Tagging model"); - String modelPath = System.getProperty("user.dir") + "/models/postagger/vi-tagger"; + String modelPath = Utils.jarDir + "/models/postagger/vi-tagger"; if (!new File(modelPath).exists()) throw new IOException("PosTagger: " + modelPath + " is not found!"); tagger = FileUtils.loadFromFile(modelPath); diff --git a/src/main/java/vn/corenlp/wordsegmenter/Vocabulary.java b/src/main/java/vn/corenlp/wordsegmenter/Vocabulary.java index bfdead0..aca365c 100755 --- a/src/main/java/vn/corenlp/wordsegmenter/Vocabulary.java +++ b/src/main/java/vn/corenlp/wordsegmenter/Vocabulary.java @@ -7,13 +7,15 @@ import java.util.HashSet; import java.util.Set; +import vn.pipeline.Utils; + @SuppressWarnings("unchecked") public class Vocabulary { public static Set VN_DICT; static { VN_DICT = new HashSet(); try { - String vocabPath = System.getProperty("user.dir") + "/models/wordsegmenter/vi-vocab"; + String vocabPath = Utils.jarDir + "/models/wordsegmenter/vi-vocab"; if (!new File(vocabPath).exists()) throw new IOException("Vocabulary: " + vocabPath + " is not found!"); //Vocabulary.class.getClassLoader().getResource("wordsegmenter/vi-vocab").getPath() diff --git a/src/main/java/vn/corenlp/wordsegmenter/WordSegmenter.java b/src/main/java/vn/corenlp/wordsegmenter/WordSegmenter.java index 6e9bd10..379c86b 100644 --- a/src/main/java/vn/corenlp/wordsegmenter/WordSegmenter.java +++ b/src/main/java/vn/corenlp/wordsegmenter/WordSegmenter.java @@ -22,7 +22,7 @@ public class WordSegmenter { public WordSegmenter() throws IOException { LOGGER.info("Loading Word Segmentation model"); - String modelPath = System.getProperty("user.dir") + "/models/wordsegmenter/wordsegmenter.rdr"; + String modelPath = vn.pipeline.Utils.jarDir + "/models/wordsegmenter/wordsegmenter.rdr"; if (!new File(modelPath).exists()) throw new IOException("WordSegmenter: " + modelPath + " is not found!"); diff --git a/src/main/java/vn/pipeline/LexicalInitializer.java b/src/main/java/vn/pipeline/LexicalInitializer.java index 2574196..0f8f753 100644 --- a/src/main/java/vn/pipeline/LexicalInitializer.java +++ b/src/main/java/vn/pipeline/LexicalInitializer.java @@ -27,12 +27,12 @@ public LexicalInitializer(boolean initLexica) throws IOException { this.initLexica = initLexica; this.lexicalMap = new HashMap<>(); - String lexicalPath = System.getProperty("user.dir") + "/models/ner/vi-500brownclusters.xz"; + String lexicalPath = Utils.jarDir + "/models/ner/vi-500brownclusters.xz"; if (!new File(lexicalPath).exists()) throw new IOException("LexicalInitializer: " + lexicalPath + " is not found!"); lexicalMap.put("word_clusters", lexicalPath); - lexicalPath = System.getProperty("user.dir") + "/models/ner/vi-pretrainedembeddings.xz"; + lexicalPath = Utils.jarDir + "/models/ner/vi-pretrainedembeddings.xz"; if (!new File(lexicalPath).exists()) throw new IOException("LexicalInitializer: " + lexicalPath + " is not found!"); lexicalMap.put("word_embeddings", lexicalPath); diff --git a/src/main/java/vn/pipeline/Utils.java b/src/main/java/vn/pipeline/Utils.java index 641bb11..c4c8f04 100644 --- a/src/main/java/vn/pipeline/Utils.java +++ b/src/main/java/vn/pipeline/Utils.java @@ -6,10 +6,14 @@ import com.optimaize.langdetect.ngram.NgramExtractors; import com.optimaize.langdetect.profiles.LanguageProfileReader; +import java.io.File; import java.io.IOException; import java.util.List; public class Utils { + private static File jarFile = new File(VnCoreNLP.class.getProtectionDomain().getCodeSource().getLocation().getPath()); + public static String jarDir = jarFile.getParentFile().getPath(); + private static LanguageDetector languageDetector = null; public static String detectLanguage(String text) throws IOException{ if(languageDetector == null) { diff --git a/src/main/java/vn/pipeline/VnCoreNLP.java b/src/main/java/vn/pipeline/VnCoreNLP.java index fc6fc0f..c965d58 100644 --- a/src/main/java/vn/pipeline/VnCoreNLP.java +++ b/src/main/java/vn/pipeline/VnCoreNLP.java @@ -14,11 +14,10 @@ public class VnCoreNLP { - private PosTagger posTagger; - public final static Logger LOGGER = Logger.getLogger(Annotation.class); private WordSegmenter wordSegmenter; + private PosTagger posTagger; private NerRecognizer nerRecognizer; private DependencyParser dependencyParser;