Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LST-7192: Analysis-Kuromoji plugin code changes for ES-5.2.2 #4

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,18 @@ gradle.projectsEvaluated {
allprojects {
apply plugin: 'idea'

repositories {
mavenCentral()
maven {
credentials {
// Put mavenUser=myusername mavenPassword=mypassword (in separate lines) in ~/.gradle/gradle.properties
username "$mavenUser"
password "$mavenPassword"
}
url "https://nqa-nexus.sprinklr.com/nexus/content/repositories/thirdparty/"
}
}

if (isIdea) {
project.buildDir = file('build-idea')
}
Expand Down
3 changes: 2 additions & 1 deletion core/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ archivesBaseName = 'elasticsearch'
dependencies {

// lucene
compile "org.apache.lucene:lucene-core:${versions.lucene}"
compile group: 'com.spr.listening.elasticsearch-5.resources', name: 'lucene-core', version: '6.4.1'
//compile "org.apache.lucene:lucene-core:${versions.lucene}"
compile "org.apache.lucene:lucene-analyzers-common:${versions.lucene}"
compile "org.apache.lucene:lucene-backward-codecs:${versions.lucene}"
compile "org.apache.lucene:lucene-grouping:${versions.lucene}"
Expand Down
28 changes: 22 additions & 6 deletions plugins/analysis-kuromoji/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,31 @@
*/

esplugin {
description 'The Japanese (kuromoji) Analysis plugin integrates Lucene kuromoji analysis module into elasticsearch.'
classname 'org.elasticsearch.plugin.analysis.kuromoji.AnalysisKuromojiPlugin'
description 'The Japanese (kuromoji) Analysis plugin integrates Lucene kuromoji analysis module into elasticsearch.'
classname 'org.elasticsearch.plugin.analysis.kuromoji.AnalysisKuromojiPlugin'
}

versions << [
'aws': '1.10.69',
'jackson': '2.8.6'
]

dependencies {
compile "org.apache.lucene:lucene-analyzers-kuromoji:${versions.lucene}"
}
compile "com.amazonaws:aws-java-sdk-s3:${versions.aws}"
compile "com.amazonaws:aws-java-sdk-kms:${versions.aws}"
compile "com.amazonaws:aws-java-sdk-core:${versions.aws}"
compile "org.apache.httpcomponents:httpclient:${versions.httpclient}"
compile "org.apache.httpcomponents:httpcore:${versions.httpcore}"
compile "commons-logging:commons-logging:${versions.commonslogging}"
compile "commons-codec:commons-codec:${versions.commonscodec}"
compile "com.fasterxml.jackson.core:jackson-databind:${versions.jackson}"
compile "com.fasterxml.jackson.core:jackson-annotations:2.8.2"
compile "com.fasterxml.jackson.core:jackson-core:${versions.jackson}"
compile "com.fasterxml.jackson.dataformat:jackson-dataformat-cbor:${versions.jackson}"

dependencyLicenses {
mapping from: /lucene-.*/, to: 'lucene'
compile group: 'com.spr.listening.elasticsearch-5.resources', name: 'lucene-analyzers-kuromoji-neologd', version: '6.4.1'
}

dependencyLicenses {
mapping from: /lucene-.*/, to: 'lucene'
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package org.apache.lucene.analysis.ja;

import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.analysis.KuromojiUserDictionarySyncUtil;

import java.util.Set;

/**
* Created by sumanth on 08/03/17.
*/
public class SprJapaneseAnalyzer extends StopwordAnalyzerBase {
private final JapaneseTokenizer.Mode mode;
private final Set<String> stoptags;
private volatile UserDictionary userDict;

public SprJapaneseAnalyzer(UserDictionary userDict, JapaneseTokenizer.Mode mode, CharArraySet stopwords, Set<String> stoptags, Settings settings) {
super(stopwords);
this.userDict = userDict;
this.mode = mode;
this.stoptags = stoptags;
KuromojiUserDictionarySyncUtil.ensureSyncThread(this, settings);
}

public void setUserDictionary(UserDictionary userDict) {
this.userDict = userDict;
getReuseStrategy().setReusableComponents(this, "ignored", null);
}

@Override
protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new JapaneseTokenizer(userDict, false, mode);
TokenStream stream = new JapanesePartOfSpeechStopFilter(tokenizer, stoptags);
stream = new CJKWidthFilter(stream);
stream = new StopFilter(stream, stopwords);
stream = new LowerCaseFilter(stream);
return new TokenStreamComponents(tokenizer, stream);
}

@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
CJKWidthFilter result = new CJKWidthFilter(in);
org.apache.lucene.analysis.LowerCaseFilter result1 = new org.apache.lucene.analysis.LowerCaseFilter(result);
return result1;
}

@Override
public int hashCode() {
return System.identityHashCode(this);
}

@Override
public boolean equals(Object obj) {
return this == obj;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.SprJapaneseAnalyzer;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
Expand All @@ -31,20 +32,20 @@

/**
*/
public class KuromojiAnalyzerProvider extends AbstractIndexAnalyzerProvider<JapaneseAnalyzer> {
public class KuromojiAnalyzerProvider extends AbstractIndexAnalyzerProvider<SprJapaneseAnalyzer> {

private final JapaneseAnalyzer analyzer;
private final SprJapaneseAnalyzer analyzer;

public KuromojiAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
final Set<?> stopWords = Analysis.parseStopWords(env, settings, JapaneseAnalyzer.getDefaultStopSet());
final JapaneseTokenizer.Mode mode = KuromojiTokenizerFactory.getMode(settings);
final UserDictionary userDictionary = KuromojiTokenizerFactory.getUserDictionary(env, settings);
analyzer = new JapaneseAnalyzer(userDictionary, mode, CharArraySet.copy(stopWords), JapaneseAnalyzer.getDefaultStopTags());
analyzer = new SprJapaneseAnalyzer(userDictionary, mode, CharArraySet.copy(stopWords), JapaneseAnalyzer.getDefaultStopTags(), env.settings());
}

@Override
public JapaneseAnalyzer get() {
public SprJapaneseAnalyzer get() {
return this.analyzer;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
package org.elasticsearch.index.analysis;

import com.amazonaws.auth.AWSCredentials;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.GetObjectRequest;
import com.amazonaws.services.s3.model.ObjectMetadata;
import org.apache.commons.codec.Charsets;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.ja.SprJapaneseAnalyzer;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import static org.elasticsearch.index.analysis.KuromojiUserDictionarySyncUtil.*;

/**
* @author Utkarsh
*/
public class KuromojiDictionarySyncRunnable implements Runnable {

private static final Logger logger = Loggers.getLogger(KuromojiDictionarySyncRunnable.class);

public static final Setting<String> AMAZON_S3_KEY_SETTING = Setting.simpleString("s3.key", Setting.Property.NodeScope);
public static final Setting<String> AMAZON_S3_SECRET_SETTING = Setting.simpleString("s3.secret", Setting.Property.NodeScope);
public static final Setting<String> AMAZON_S3_BUCKET_SETTING = Setting.simpleString("s3.bucket", Setting.Property.NodeScope);
public static final Setting<String> AMAZON_S3_OBJECT_SETTING = Setting.simpleString("s3.object", Setting.Property.NodeScope);

private final Set<SprJapaneseAnalyzer> analyzers = Collections.synchronizedSet(new HashSet<>());
private final AmazonS3Client amazonS3Client;
private long lastSyncTime;
private volatile UserDictionary userDict;

public KuromojiDictionarySyncRunnable(SprJapaneseAnalyzer analyzer, Settings settings) {
initializeSettings(settings);
AWSCredentials credentials = new BasicAWSCredentials(AMAZON_S3_KEY, AMAZON_S3_SECRET);
this.amazonS3Client = new AmazonS3Client(credentials);
this.analyzers.add(analyzer);
this.lastSyncTime = 0;
}

public void addAnalyzer(SprJapaneseAnalyzer analyzer) {
this.analyzers.add(analyzer);
analyzer.setUserDictionary(userDict);
}

@Override
public void run() {
while (true) {
try {
try {
logger.info("Kuromoji dictionary sync triggered");

long s3UpdateTime = getLastS3UpdateTime();
if (s3UpdateTime > lastSyncTime) {
userDict = downloadDictionary();
for (SprJapaneseAnalyzer analyzer : analyzers) {
analyzer.setUserDictionary(userDict);
}
logger.info("Kuromoji dictionary successfully updated for time " + lastSyncTime);
} else {
logger.info("Kuromoji dictionary left untouched");
}

} catch (Throwable t) {
logger.error("An exception occured during kuromoji dictionary refresh : " + t.getMessage(), t);
}

reallySleep(REFRESH_INTERVAL);
} catch (Throwable t) {
logger.error("Unexpected exception occured during kuromoji dictionary refresh : " + t.getMessage(), t);
}
}
}

private UserDictionary downloadDictionary() throws IOException {
GetObjectRequest request = new GetObjectRequest(AMAZON_S3_BUCKET, AMAZON_S3_OBJECT);
File dictionaryFile = null;
UserDictionary dictionary = null;
try {
dictionaryFile = new File(DICTIONARY_FILE);
ObjectMetadata metadata = amazonS3Client.getObject(request, dictionaryFile);
dictionary = readUserDictionary(dictionaryFile);
lastSyncTime = metadata.getLastModified().getTime();
} finally {
try {
dictionaryFile.delete();
} catch (Throwable t) {
// It's okay. You tried.
}
}
return dictionary;
}

private UserDictionary readUserDictionary(File file) throws IOException {
try {
InputStreamReader reader = new InputStreamReader(new FileInputStream(file), Charsets.UTF_8);
return UserDictionary.open(reader);
} catch (IOException var8) {
String message = String.format(Locale.ROOT, "IOException while reading %s : %s", file.getAbsolutePath(), var8.getMessage());
throw new IOException(message, var8);
}
}

private long getLastS3UpdateTime() {
ObjectMetadata metadata = amazonS3Client.getObjectMetadata(AMAZON_S3_BUCKET, AMAZON_S3_OBJECT);
if (metadata != null && metadata.getLastModified() != null) {
return metadata.getLastModified().getTime();
}

return 0;
}

private void reallySleep(long millis) {
boolean threadInterrupted = false;
final long nanos = TimeUnit.MILLISECONDS.toNanos(millis);
final long end = System.nanoTime() + nanos;
long remaining;
try {
do {
remaining = end - System.nanoTime();
if (remaining <= 0) {
break;
}
try {
Thread.sleep(TimeUnit.NANOSECONDS.toMillis(remaining));
} catch (InterruptedException e) {
threadInterrupted = true;
}
} while (remaining > 0);
} finally {
if (threadInterrupted) {
Thread.currentThread().interrupt();
}
}
}

private static void initializeSettings(Settings settings) {
AMAZON_S3_KEY = getSettingValue(AMAZON_S3_KEY_SETTING, settings);
AMAZON_S3_SECRET = getSettingValue(AMAZON_S3_SECRET_SETTING, settings);
AMAZON_S3_BUCKET = getSettingValue(AMAZON_S3_BUCKET_SETTING, settings);
AMAZON_S3_OBJECT = getSettingValue(AMAZON_S3_OBJECT_SETTING, settings);
}

private static String getSettingValue(Setting<String> setting, Settings settings) {
if (!setting.exists(settings)) {
throw new IllegalArgumentException(setting.getKey() + " not set in config file - elasticsearch.yml.");
}
return setting.get(settings);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ public KuromojiTokenizerFactory(IndexSettings indexSettings, Environment env, St
super(indexSettings, name, settings);
mode = getMode(settings);
userDictionary = getUserDictionary(env, settings);
discartPunctuation = settings.getAsBoolean("discard_punctuation", true);
discartPunctuation = settings.getAsBoolean("discard_punctuation", false);
nBestCost = settings.getAsInt(NBEST_COST, -1);
nBestExamples = settings.get(NBEST_EXAMPLES);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package org.elasticsearch.index.analysis;

import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.ja.SprJapaneseAnalyzer;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;

import java.util.concurrent.TimeUnit;

/**
* Created by sumanth on 08/03/17.
*/
public class KuromojiUserDictionarySyncUtil {

private static final Logger logger = Loggers.getLogger(KuromojiUserDictionarySyncUtil.class);

public static final String THREAD_NAME = "Sprinklr Kuromoji User Dictionary Sync Thread";


public static String AMAZON_S3_KEY;
public static String AMAZON_S3_SECRET;
public static String AMAZON_S3_BUCKET;
public static String AMAZON_S3_OBJECT;

public static final String DICTIONARY_FILE = "/mnt1/tmp/kuromoji/dictionary.dat";
public static final long REFRESH_INTERVAL = TimeUnit.MINUTES.toMillis(30);

private static volatile KuromojiDictionarySyncRunnable syncRunnable;

public static void ensureSyncThread(SprJapaneseAnalyzer analyzer, Settings settings) {
if (syncRunnable == null) {
synchronized (KuromojiUserDictionarySyncUtil.class) {
if (syncRunnable == null) {
try {
KuromojiDictionarySyncRunnable runnable = new KuromojiDictionarySyncRunnable(analyzer, settings);
Thread syncThread = new Thread(runnable, THREAD_NAME);
syncThread.setDaemon(true);
syncThread.start();
syncRunnable = runnable;
syncRunnable.addAnalyzer(analyzer);
} catch (Throwable t) {
logger.error("Failed to start " + THREAD_NAME, t);
}
} else {
syncRunnable.addAnalyzer(analyzer);
}
}
} else {
syncRunnable.addAnalyzer(analyzer);
}
}
}
Loading