From ab12e3fccab658a5649c9ae9a1c20272b36939c3 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Wed, 5 Sep 2018 17:13:55 +0100 Subject: [PATCH 1/6] Add script_filter tokenfilter --- docs/reference/analysis/tokenfilters.asciidoc | 2 + .../scriptfilter-tokenfilter.asciidoc | 79 ++++++++++++++++ .../common/AnalysisPredicateScript.java | 56 ++++++++---- .../analysis/common/CommonAnalysisPlugin.java | 2 + .../ScriptedConditionTokenFilterFactory.java | 41 ++++----- .../ScriptedFilteringTokenFilterFactory.java | 73 +++++++++++++++ .../ScriptedFilteringTokenFilterTests.java | 89 +++++++++++++++++++ .../analysis-common/60_analysis_scripting.yml | 35 ++++++++ 8 files changed, 340 insertions(+), 37 deletions(-) create mode 100644 docs/reference/analysis/tokenfilters/scriptfilter-tokenfilter.asciidoc create mode 100644 modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedFilteringTokenFilterFactory.java create mode 100644 modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/ScriptedFilteringTokenFilterTests.java diff --git a/docs/reference/analysis/tokenfilters.asciidoc b/docs/reference/analysis/tokenfilters.asciidoc index 5899744247899..76d9ece842eec 100644 --- a/docs/reference/analysis/tokenfilters.asciidoc +++ b/docs/reference/analysis/tokenfilters.asciidoc @@ -39,6 +39,8 @@ include::tokenfilters/multiplexer-tokenfilter.asciidoc[] include::tokenfilters/condition-tokenfilter.asciidoc[] +include::tokenfilters/scriptfilter-tokenfilter.asciidoc[] + include::tokenfilters/stemmer-tokenfilter.asciidoc[] include::tokenfilters/stemmer-override-tokenfilter.asciidoc[] diff --git a/docs/reference/analysis/tokenfilters/scriptfilter-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/scriptfilter-tokenfilter.asciidoc new file mode 100644 index 0000000000000..980bc4addc3ab --- /dev/null +++ b/docs/reference/analysis/tokenfilters/scriptfilter-tokenfilter.asciidoc @@ -0,0 +1,79 @@ +[[analysis-scriptfilter-tokenfilter]] +=== Scripted Filtering Token Filter + +The script_filter token filter takes a predicate script, and removes tokens that do +not match the predicate. + +[float] +=== Options +[horizontal] +script:: a predicate script that determines whether or not the current token will +be emitted. Note that only inline scripts are supported. 
+
+[float]
+=== Settings example
+
+You can set it up like:
+
+[source,js]
+--------------------------------------------------
+PUT /condition_example
+{
+  "settings" : {
+    "analysis" : {
+      "analyzer" : {
+        "my_analyzer" : {
+          "tokenizer" : "standard",
+          "filter" : [ "my_script_filter" ]
+        }
+      },
+      "filter" : {
+        "my_script_filter" : {
+          "type" : "script_filter",
+          "script" : {
+            "source" : "token.getTerm().length() < 5" <1>
+          }
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+
+<1> This will skip tokens that are 5 characters long or less
+
+And test it like:
+
+[source,js]
+--------------------------------------------------
+POST /condition_example/_analyze
+{
+  "analyzer" : "my_analyzer",
+  "text" : "What Flapdoodle"
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
+And it'd respond:
+
+[source,js]
+--------------------------------------------------
+{
+  "tokens": [
+    {
+      "token": "Flapdoodle", <1>
+      "start_offset": 5,
+      "end_offset": 15,
+      "type": "<ALPHANUM>",
+      "position": 1 <2>
+    }
+  ]
+}
+--------------------------------------------------
+// TESTRESPONSE
+
+<1> The token 'What' has been removed from the tokenstream because it does not
+match the predicate.
+<2> The position and offset values are unaffected by the removal of earlier tokens
\ No newline at end of file
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AnalysisPredicateScript.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AnalysisPredicateScript.java
index 7de588a958c77..3bda6f393bfdf 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AnalysisPredicateScript.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AnalysisPredicateScript.java
@@ -19,6 +19,13 @@
 
 package org.elasticsearch.analysis.common;
 
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
 import org.elasticsearch.script.ScriptContext;
 
 /**
@@ -30,21 +37,40 @@ public abstract class AnalysisPredicateScript {
      * Encapsulation of the state of the current token
      */
     public static class Token {
-        public CharSequence term;
-        public int pos;
-        public int posInc;
-        public int posLen;
-        public int startOffset;
-        public int endOffset;
-        public String type;
-        public boolean isKeyword;
+
+        private final CharTermAttribute termAtt;
+        private final PositionIncrementAttribute posIncAtt;
+        private final PositionLengthAttribute posLenAtt;
+        private final OffsetAttribute offsetAtt;
+        private final TypeAttribute typeAtt;
+        private final KeywordAttribute keywordAtt;
+
+        // posInc is always 1 at the beginning of a tokenstream and the convention
+        // from the _analyze endpoint is that tokenstream positions are 0-based
+        private int pos = -1;
+
+        /**
+         * Create a token exposing values from an AttributeSource
+         */
+        public Token(AttributeSource source) {
+            this.termAtt = source.addAttribute(CharTermAttribute.class);
+            this.posIncAtt = source.addAttribute(PositionIncrementAttribute.class);
+            this.posLenAtt = source.addAttribute(PositionLengthAttribute.class);
+            this.offsetAtt = source.addAttribute(OffsetAttribute.class);
+            this.typeAtt = source.addAttribute(TypeAttribute.class);
+            this.keywordAtt = source.addAttribute(KeywordAttribute.class);
+        }
+
+        public void updatePosition() {
+            this.pos = this.pos + posIncAtt.getPositionIncrement();
+        }
 
         public CharSequence getTerm() {
-            return term;
+            return termAtt;
         }
 
         public int getPositionIncrement() {
-            return posInc;
+            return posIncAtt.getPositionIncrement();
         }
 
         public int getPosition() {
@@ -52,23 +78,23 @@ public int getPosition() {
         }
 
         public int getPositionLength() {
-            return posLen;
+            return posLenAtt.getPositionLength();
         }
 
         public int getStartOffset() {
-            return startOffset;
+            return offsetAtt.startOffset();
         }
 
         public int getEndOffset() {
-            return endOffset;
+            return offsetAtt.endOffset();
        }
 
         public String getType() {
-            return type;
+            return typeAtt.type();
         }
 
         public boolean isKeyword() {
-            return isKeyword;
+            return keywordAtt.isKeyword();
         }
     }
 
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
index bbd721169c6c7..e3d95b192c1f8 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
@@ -271,6 +271,8 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
         filters.put("russian_stem", RussianStemTokenFilterFactory::new);
         filters.put("scandinavian_folding", ScandinavianFoldingFilterFactory::new);
         filters.put("scandinavian_normalization", ScandinavianNormalizationFilterFactory::new);
+        filters.put("script_filter",
+            requiresAnalysisSettings((i, e, n, s) -> new ScriptedFilteringTokenFilterFactory(i, n, s, scriptService.get())));
         filters.put("serbian_normalization", SerbianNormalizationFilterFactory::new);
         filters.put("snowball", SnowballTokenFilterFactory::new);
         filters.put("sorani_normalization", SoraniNormalizationFilterFactory::new);
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java
index cf7fd5b047a89..76f723fe6e89a 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java
@@ -36,6 +36,7 @@
 import org.elasticsearch.script.ScriptService;
 import org.elasticsearch.script.ScriptType;
 
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
@@ -76,30 +77,26 @@ public TokenStream create(TokenStream tokenStream) {
             }
             return in;
         };
-        AnalysisPredicateScript script = factory.newInstance();
-        final AnalysisPredicateScript.Token token = new AnalysisPredicateScript.Token();
-        return new ConditionalTokenFilter(tokenStream, filter) {
+        return new ScriptedConditionTokenFilter(tokenStream, filter, factory.newInstance());
+    }
 
-            CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-            PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
-            PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
-            OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-            TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-            KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
+    private static class ScriptedConditionTokenFilter extends ConditionalTokenFilter {
 
-            @Override
-            protected boolean shouldFilter() {
-                token.term = termAtt;
-                token.posInc = posIncAtt.getPositionIncrement();
-                token.pos += token.posInc;
-                token.posLen = posLenAtt.getPositionLength();
-                token.startOffset = offsetAtt.startOffset();
-                token.endOffset = offsetAtt.endOffset();
-                token.type = typeAtt.type();
-                token.isKeyword = keywordAtt.isKeyword();
-                return script.execute(token);
-            }
-        };
+        private final AnalysisPredicateScript script;
+        private final AnalysisPredicateScript.Token token;
+
+        ScriptedConditionTokenFilter(TokenStream input, Function<TokenStream, TokenStream> inputFactory,
+                                     AnalysisPredicateScript script) {
+            super(input, inputFactory);
+            this.script = script;
+            this.token = new AnalysisPredicateScript.Token(this);
+        }
+
+        @Override
+        protected boolean shouldFilter() throws IOException {
+            token.updatePosition();
+            return script.execute(token);
+        }
     }
 
     @Override
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedFilteringTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedFilteringTokenFilterFactory.java
new file mode 100644
index 0000000000000..6ab499e445c33
--- /dev/null
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedFilteringTokenFilterFactory.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ + +package org.elasticsearch.analysis.common; + +import org.apache.lucene.analysis.FilteringTokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.script.Script; +import org.elasticsearch.script.ScriptService; +import org.elasticsearch.script.ScriptType; + +import java.io.IOException; + +/** + * A factory for creating FilteringTokenFilters that determine whether or not to + * accept their underlying token by consulting a script + */ +public class ScriptedFilteringTokenFilterFactory extends AbstractTokenFilterFactory { + + private final AnalysisPredicateScript.Factory factory; + + public ScriptedFilteringTokenFilterFactory(IndexSettings indexSettings, String name, Settings settings, ScriptService scriptService) { + super(indexSettings, name, settings); + Settings scriptSettings = settings.getAsSettings("script"); + Script script = Script.parse(scriptSettings); + if (script.getType() != ScriptType.INLINE) { + throw new IllegalArgumentException("Cannot use stored scripts in tokenfilter [" + name + "]"); + } + this.factory = scriptService.compile(script, AnalysisPredicateScript.CONTEXT); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new ScriptFilteringTokenFilter(tokenStream, factory.newInstance()); + } + + private static class ScriptFilteringTokenFilter extends FilteringTokenFilter { + + final AnalysisPredicateScript script; + final AnalysisPredicateScript.Token token; + + ScriptFilteringTokenFilter(TokenStream in, AnalysisPredicateScript script) { + super(in); + this.script = script; + this.token = new AnalysisPredicateScript.Token(this); + } + + @Override + protected boolean accept() throws IOException { + token.updatePosition(); + return script.execute(token); + } + } +} diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/ScriptedFilteringTokenFilterTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/ScriptedFilteringTokenFilterTests.java new file mode 100644 index 0000000000000..b8dd124d16b12 --- /dev/null +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/ScriptedFilteringTokenFilterTests.java @@ -0,0 +1,89 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+package org.elasticsearch.analysis.common;
+
+import org.elasticsearch.Version;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.env.TestEnvironment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.IndexAnalyzers;
+import org.elasticsearch.index.analysis.NamedAnalyzer;
+import org.elasticsearch.indices.analysis.AnalysisModule;
+import org.elasticsearch.script.Script;
+import org.elasticsearch.script.ScriptContext;
+import org.elasticsearch.script.ScriptService;
+import org.elasticsearch.test.ESTokenStreamTestCase;
+import org.elasticsearch.test.IndexSettingsModule;
+
+import java.io.IOException;
+import java.util.Collections;
+
+public class ScriptedFilteringTokenFilterTests extends ESTokenStreamTestCase {
+
+    public void testSimpleFilter() throws IOException {
+        Settings settings = Settings.builder()
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .build();
+        Settings indexSettings = Settings.builder()
+            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
+            .put("index.analysis.filter.f.type", "script_filter")
+            .put("index.analysis.filter.f.script.source", "token.getTerm().length() > 5")
+            .put("index.analysis.analyzer.myAnalyzer.type", "custom")
+            .put("index.analysis.analyzer.myAnalyzer.tokenizer", "standard")
+            .putList("index.analysis.analyzer.myAnalyzer.filter", "f")
+            .build();
+        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
+
+        AnalysisPredicateScript.Factory factory = () -> new AnalysisPredicateScript() {
+            @Override
+            public boolean execute(Token token) {
+                return token.getTerm().length() > 5;
+            }
+        };
+
+        @SuppressWarnings("unchecked")
+        ScriptService scriptService = new ScriptService(indexSettings, Collections.emptyMap(), Collections.emptyMap()){
+            @Override
+            public <FactoryType> FactoryType compile(Script script, ScriptContext<FactoryType> context) {
+                assertEquals(context, AnalysisPredicateScript.CONTEXT);
+                assertEquals(new Script("token.getTerm().length() > 5"), script);
+                return (FactoryType) factory;
+            }
+        };
+
+        CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();
+        plugin.createComponents(null, null, null, null, scriptService, null, null, null, null);
+        AnalysisModule module
+            = new AnalysisModule(TestEnvironment.newEnvironment(settings), Collections.singletonList(plugin));
+
+        IndexAnalyzers analyzers = module.getAnalysisRegistry().build(idxSettings);
+
+        try (NamedAnalyzer analyzer = analyzers.get("myAnalyzer")) {
+            assertNotNull(analyzer);
+            assertAnalyzesTo(analyzer, "Vorsprung Durch Technik", new String[]{
+                "Vorsprung", "Technik"
+            });
+        }
+
+    }
+
+}
diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/60_analysis_scripting.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/60_analysis_scripting.yml
index 4305e5db0af37..446459092998d 100644
--- a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/60_analysis_scripting.yml
+++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/60_analysis_scripting.yml
@@ -34,3 +34,38 @@
   - match: { tokens.0.token: "Vorsprung" }
   - match: { tokens.1.token: "durch" }
   - match: { tokens.2.token: "technik" }
+
+---
+"script_filter":
+  - do:
+      indices.analyze:
+        body:
+          text: "Vorsprung Durch Technik"
+          tokenizer: "whitespace"
+          filter:
+            - type: script_filter
+              script:
source: "token.term.length() > 5" + + - length: { tokens: 2 } + - match: { tokens.0.token: "Vorsprung" } + - match: { tokens.1.token: "Technik" } + +--- +"script_filter_position": + - do: + indices.analyze: + body: + text: "a b c d e f g h" + tokenizer: "whitespace" + filter: + - type: script_filter + script: + source: "token.position >= 4" + + - length: { tokens: 4 } + - match: { tokens.0.token: "e" } + - match: { tokens.1.token: "f" } + - match: { tokens.2.token: "g" } + - match: { tokens.3.token: "h" } + From f13f8a542e90a84bf13cefe8d89a7ed46d729a43 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Wed, 5 Sep 2018 17:47:23 +0100 Subject: [PATCH 2/6] Fix docs --- .../analysis/tokenfilters/scriptfilter-tokenfilter.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/analysis/tokenfilters/scriptfilter-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/scriptfilter-tokenfilter.asciidoc index 980bc4addc3ab..34ee7b80ab106 100644 --- a/docs/reference/analysis/tokenfilters/scriptfilter-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/scriptfilter-tokenfilter.asciidoc @@ -31,7 +31,7 @@ PUT /condition_example "my_script_filter" : { "type" : "script_filter", "script" : { - "source" : "token.getTerm().length() < 5" <1> + "source" : "token.getTerm().length() > 5" <1> } } } From e66ebd70564b07582077a57d49ca8309fa1d6d89 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Thu, 6 Sep 2018 09:02:40 +0100 Subject: [PATCH 3/6] checkstyle --- .../common/ScriptedConditionTokenFilterFactory.java | 6 ------ 1 file changed, 6 deletions(-) diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java index 76f723fe6e89a..56f60bb874a5b 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java @@ -21,12 +21,6 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; From e91a3eb38fda10e3dbf19e69453cd5d229207dca Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Fri, 7 Sep 2018 08:12:42 +0100 Subject: [PATCH 4/6] Naming --- docs/reference/analysis/tokenfilters.asciidoc | 2 +- ...nfilter.asciidoc => predicate-tokenfilter.asciidoc} | 10 +++++----- .../analysis/common/CommonAnalysisPlugin.java | 4 ++-- ...ory.java => PredicateTokenFilterScriptFactory.java} | 4 ++-- ...Tests.java => PredicateTokenScriptFilterTests.java} | 4 ++-- .../test/analysis-common/60_analysis_scripting.yml | 4 ++-- 6 files changed, 14 insertions(+), 14 deletions(-) rename docs/reference/analysis/tokenfilters/{scriptfilter-tokenfilter.asciidoc => predicate-tokenfilter.asciidoc} (85%) 
rename modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/{ScriptedFilteringTokenFilterFactory.java => PredicateTokenFilterScriptFactory.java} (92%) rename modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/{ScriptedFilteringTokenFilterTests.java => PredicateTokenScriptFilterTests.java} (96%) diff --git a/docs/reference/analysis/tokenfilters.asciidoc b/docs/reference/analysis/tokenfilters.asciidoc index 76d9ece842eec..b09a1b9c0b244 100644 --- a/docs/reference/analysis/tokenfilters.asciidoc +++ b/docs/reference/analysis/tokenfilters.asciidoc @@ -39,7 +39,7 @@ include::tokenfilters/multiplexer-tokenfilter.asciidoc[] include::tokenfilters/condition-tokenfilter.asciidoc[] -include::tokenfilters/scriptfilter-tokenfilter.asciidoc[] +include::tokenfilters/predicate-tokenfilter.asciidoc[] include::tokenfilters/stemmer-tokenfilter.asciidoc[] diff --git a/docs/reference/analysis/tokenfilters/scriptfilter-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/predicate-tokenfilter.asciidoc similarity index 85% rename from docs/reference/analysis/tokenfilters/scriptfilter-tokenfilter.asciidoc rename to docs/reference/analysis/tokenfilters/predicate-tokenfilter.asciidoc index 34ee7b80ab106..0a6e2cf1d7148 100644 --- a/docs/reference/analysis/tokenfilters/scriptfilter-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/predicate-tokenfilter.asciidoc @@ -1,7 +1,7 @@ -[[analysis-scriptfilter-tokenfilter]] -=== Scripted Filtering Token Filter +[[analysis-predicatefilter-tokenfilter]] +=== Predicate Token Script Filter -The script_filter token filter takes a predicate script, and removes tokens that do +The predicate_token_filter token filter takes a predicate script, and removes tokens that do not match the predicate. 
[float] @@ -29,7 +29,7 @@ PUT /condition_example }, "filter" : { "my_script_filter" : { - "type" : "script_filter", + "type" : "predicate_token_filter", "script" : { "source" : "token.getTerm().length() > 5" <1> } @@ -41,7 +41,7 @@ PUT /condition_example -------------------------------------------------- // CONSOLE -<1> This will skip tokens that are 5 characters long or less +<1> This will emit tokens that are more than 5 characters long And test it like: diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index e3d95b192c1f8..e024df1c606ac 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -266,13 +266,13 @@ public Map> getTokenFilters() { filters.put("pattern_replace", requiresAnalysisSettings(PatternReplaceTokenFilterFactory::new)); filters.put("persian_normalization", PersianNormalizationFilterFactory::new); filters.put("porter_stem", PorterStemTokenFilterFactory::new); + filters.put("predicate_token_filter", + requiresAnalysisSettings((i, e, n, s) -> new PredicateTokenFilterScriptFactory(i, n, s, scriptService.get()))); filters.put("remove_duplicates", RemoveDuplicatesTokenFilterFactory::new); filters.put("reverse", ReverseTokenFilterFactory::new); filters.put("russian_stem", RussianStemTokenFilterFactory::new); filters.put("scandinavian_folding", ScandinavianFoldingFilterFactory::new); filters.put("scandinavian_normalization", ScandinavianNormalizationFilterFactory::new); - filters.put("script_filter", - requiresAnalysisSettings((i, e, n, s) -> new ScriptedFilteringTokenFilterFactory(i, n, s, scriptService.get()))); filters.put("serbian_normalization", SerbianNormalizationFilterFactory::new); filters.put("snowball", SnowballTokenFilterFactory::new); filters.put("sorani_normalization", SoraniNormalizationFilterFactory::new); diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedFilteringTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PredicateTokenFilterScriptFactory.java similarity index 92% rename from modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedFilteringTokenFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PredicateTokenFilterScriptFactory.java index 6ab499e445c33..84f4bb487060c 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedFilteringTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PredicateTokenFilterScriptFactory.java @@ -34,11 +34,11 @@ * A factory for creating FilteringTokenFilters that determine whether or not to * accept their underlying token by consulting a script */ -public class ScriptedFilteringTokenFilterFactory extends AbstractTokenFilterFactory { +public class PredicateTokenFilterScriptFactory extends AbstractTokenFilterFactory { private final AnalysisPredicateScript.Factory factory; - public ScriptedFilteringTokenFilterFactory(IndexSettings indexSettings, String name, Settings settings, ScriptService scriptService) { + public PredicateTokenFilterScriptFactory(IndexSettings indexSettings, String name, Settings settings, ScriptService scriptService) { super(indexSettings, name, 
settings); Settings scriptSettings = settings.getAsSettings("script"); Script script = Script.parse(scriptSettings); diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/ScriptedFilteringTokenFilterTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/PredicateTokenScriptFilterTests.java similarity index 96% rename from modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/ScriptedFilteringTokenFilterTests.java rename to modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/PredicateTokenScriptFilterTests.java index b8dd124d16b12..18afbdcecb3e6 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/ScriptedFilteringTokenFilterTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/PredicateTokenScriptFilterTests.java @@ -37,7 +37,7 @@ import java.io.IOException; import java.util.Collections; -public class ScriptedFilteringTokenFilterTests extends ESTokenStreamTestCase { +public class PredicateTokenScriptFilterTests extends ESTokenStreamTestCase { public void testSimpleFilter() throws IOException { Settings settings = Settings.builder() @@ -45,7 +45,7 @@ public void testSimpleFilter() throws IOException { .build(); Settings indexSettings = Settings.builder() .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) - .put("index.analysis.filter.f.type", "script_filter") + .put("index.analysis.filter.f.type", "predicate_token_filter") .put("index.analysis.filter.f.script.source", "token.getTerm().length() > 5") .put("index.analysis.analyzer.myAnalyzer.type", "custom") .put("index.analysis.analyzer.myAnalyzer.tokenizer", "standard") diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/60_analysis_scripting.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/60_analysis_scripting.yml index 446459092998d..6f0afd885e659 100644 --- a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/60_analysis_scripting.yml +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/60_analysis_scripting.yml @@ -43,7 +43,7 @@ text: "Vorsprung Durch Technik" tokenizer: "whitespace" filter: - - type: script_filter + - type: predicate_token_filter script: source: "token.term.length() > 5" @@ -59,7 +59,7 @@ text: "a b c d e f g h" tokenizer: "whitespace" filter: - - type: script_filter + - type: predicate_token_filter script: source: "token.position >= 4" From 8fc81d578713722319e8654b9faaee4d4edc93c9 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Fri, 7 Sep 2018 08:16:04 +0100 Subject: [PATCH 5/6] positions start at 0 --- .../test/analysis-common/60_analysis_scripting.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/60_analysis_scripting.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/60_analysis_scripting.yml index 6f0afd885e659..2015fe31fccb5 100644 --- a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/60_analysis_scripting.yml +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/60_analysis_scripting.yml @@ -28,7 +28,7 @@ - type: condition filter: [ "lowercase" ] script: - source: "token.position > 1 && token.positionIncrement > 0 && token.startOffset > 0 && token.endOffset > 0 && (token.positionLength == 1 || token.type == \"a\" || 
token.keyword)" + source: "token.position >= 1 && token.positionIncrement > 0 && token.startOffset > 0 && token.endOffset > 0 && (token.positionLength == 1 || token.type == \"a\" || token.keyword)" - length: { tokens: 3 } - match: { tokens.0.token: "Vorsprung" } From 63c5bd09a3359b0c4d9664268f21d64b6c1e27a1 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Sun, 9 Sep 2018 12:25:19 +0100 Subject: [PATCH 6/6] feedback --- .../analysis/tokenfilters/predicate-tokenfilter.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/analysis/tokenfilters/predicate-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/predicate-tokenfilter.asciidoc index 0a6e2cf1d7148..bebf7bd80f250 100644 --- a/docs/reference/analysis/tokenfilters/predicate-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/predicate-tokenfilter.asciidoc @@ -1,5 +1,5 @@ [[analysis-predicatefilter-tokenfilter]] -=== Predicate Token Script Filter +=== Predicate Token Filter Script The predicate_token_filter token filter takes a predicate script, and removes tokens that do not match the predicate.