Commit
Add predicate_token_filter (#33431)
This allows users to filter out tokens from a TokenStream using painless scripts,
instead of having to write specialised Java code and package it up into a plugin.

The commit also refactors the AnalysisPredicateScript.Token class so that it wraps
an AttributeSource and exposes it read-only.
romseygeek committed Sep 11, 2018
1 parent a55fa4f commit f598297
Showing 8 changed files with 341 additions and 44 deletions.
docs/reference/analysis/tokenfilters.asciidoc (2 additions, 0 deletions)
@@ -37,6 +37,8 @@ include::tokenfilters/multiplexer-tokenfilter.asciidoc[]
 
 include::tokenfilters/condition-tokenfilter.asciidoc[]
 
+include::tokenfilters/predicate-tokenfilter.asciidoc[]
+
 include::tokenfilters/stemmer-tokenfilter.asciidoc[]
 
 include::tokenfilters/stemmer-override-tokenfilter.asciidoc[]
docs/reference/analysis/tokenfilters/predicate-tokenfilter.asciidoc (new file)
@@ -0,0 +1,79 @@
[[analysis-predicatefilter-tokenfilter]]
=== Predicate Token Filter Script

The `predicate_token_filter` token filter takes a predicate script and removes
tokens that do not match the predicate.

[float]
=== Options
[horizontal]
script:: a predicate script that determines whether the current token will
be emitted. Note that only inline scripts are supported.

[float]
=== Settings example

You can set it up like:

[source,js]
--------------------------------------------------
PUT /condition_example
{
  "settings" : {
    "analysis" : {
      "analyzer" : {
        "my_analyzer" : {
          "tokenizer" : "standard",
          "filter" : [ "my_script_filter" ]
        }
      },
      "filter" : {
        "my_script_filter" : {
          "type" : "predicate_token_filter",
          "script" : {
            "source" : "token.getTerm().length() > 5" <1>
          }
        }
      }
    }
  }
}
--------------------------------------------------
// CONSOLE

<1> Only tokens that are more than 5 characters long will be emitted

And test it like:

[source,js]
--------------------------------------------------
POST /condition_example/_analyze
{
  "analyzer" : "my_analyzer",
  "text" : "What Flapdoodle"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]

And it'd respond:

[source,js]
--------------------------------------------------
{
  "tokens": [
    {
      "token": "Flapdoodle", <1>
      "start_offset": 5,
      "end_offset": 15,
      "type": "<ALPHANUM>",
      "position": 1 <2>
    }
  ]
}
--------------------------------------------------
// TESTRESPONSE

<1> The token 'What' has been removed from the tokenstream because it does not
match the predicate.
<2> The position and offset values are unaffected by the removal of earlier tokens.
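
The predicate sees the whole token state, not just the term: position, offsets,
type, position increment and length, and keyword status are all available through
the getters on `AnalysisPredicateScript.Token`. As a purely illustrative variation
(the index and filter names here are hypothetical, not part of this commit), a
filter that drops the first token of every stream could test the 0-based position:

[source,js]
--------------------------------------------------
PUT /hypothetical_example
{
  "settings" : {
    "analysis" : {
      "filter" : {
        "skip_first_token" : {
          "type" : "predicate_token_filter",
          "script" : {
            "source" : "token.getPosition() > 0"
          }
        }
      }
    }
  }
}
--------------------------------------------------
// NOTCONSOLE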
org/elasticsearch/analysis/common/AnalysisPredicateScript.java

@@ -19,6 +19,13 @@

 package org.elasticsearch.analysis.common;
 
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
 import org.elasticsearch.script.ScriptContext;
 
 /**
@@ -30,45 +37,64 @@ public abstract class AnalysisPredicateScript {
      * Encapsulation of the state of the current token
      */
     public static class Token {
-        public CharSequence term;
-        public int pos;
-        public int posInc;
-        public int posLen;
-        public int startOffset;
-        public int endOffset;
-        public String type;
-        public boolean isKeyword;
+
+        private final CharTermAttribute termAtt;
+        private final PositionIncrementAttribute posIncAtt;
+        private final PositionLengthAttribute posLenAtt;
+        private final OffsetAttribute offsetAtt;
+        private final TypeAttribute typeAtt;
+        private final KeywordAttribute keywordAtt;
+
+        // posInc is always 1 at the beginning of a tokenstream and the convention
+        // from the _analyze endpoint is that tokenstream positions are 0-based
+        private int pos = -1;
+
+        /**
+         * Create a token exposing values from an AttributeSource
+         */
+        public Token(AttributeSource source) {
+            this.termAtt = source.addAttribute(CharTermAttribute.class);
+            this.posIncAtt = source.addAttribute(PositionIncrementAttribute.class);
+            this.posLenAtt = source.addAttribute(PositionLengthAttribute.class);
+            this.offsetAtt = source.addAttribute(OffsetAttribute.class);
+            this.typeAtt = source.addAttribute(TypeAttribute.class);
+            this.keywordAtt = source.addAttribute(KeywordAttribute.class);
+        }
+
+        public void updatePosition() {
+            this.pos = this.pos + posIncAtt.getPositionIncrement();
+        }
 
         public CharSequence getTerm() {
-            return term;
+            return termAtt;
         }
 
         public int getPositionIncrement() {
-            return posInc;
+            return posIncAtt.getPositionIncrement();
         }
 
         public int getPosition() {
             return pos;
         }
 
         public int getPositionLength() {
-            return posLen;
+            return posLenAtt.getPositionLength();
         }
 
         public int getStartOffset() {
-            return startOffset;
+            return offsetAtt.startOffset();
         }
 
         public int getEndOffset() {
-            return endOffset;
+            return offsetAtt.endOffset();
         }
 
         public String getType() {
-            return type;
+            return typeAtt.type();
         }
 
         public boolean isKeyword() {
-            return isKeyword;
+            return keywordAtt.isKeyword();
         }
     }
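
To make the new wrapper concrete: Token registers the six attributes on the
supplied AttributeSource once, and its getters then read whatever the stream
currently holds, so a single Token instance stays valid across incrementToken()
calls. A minimal sketch of driving it from a plain Lucene tokenizer follows; the
WhitespaceTokenizer, the demo text, and placing the class in the
org.elasticsearch.analysis.common package are assumptions for illustration, not
part of this commit.

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;

public class TokenWrapperDemo {
    public static void main(String[] args) throws Exception {
        // A Tokenizer is an AttributeSource, so Token can wrap it directly
        Tokenizer stream = new WhitespaceTokenizer();
        stream.setReader(new StringReader("What Flapdoodle"));
        AnalysisPredicateScript.Token token = new AnalysisPredicateScript.Token(stream);
        stream.reset();
        while (stream.incrementToken()) {
            token.updatePosition(); // accumulates position increments, 0-based
            System.out.println(token.getTerm() + " @ position " + token.getPosition());
        }
        stream.end();
        stream.close();
    }
}

This prints "What @ position 0" followed by "Flapdoodle @ position 1", matching
the 0-based convention noted in the comment above.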

org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
@@ -264,6 +264,8 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
         filters.put("pattern_replace", requiresAnalysisSettings(PatternReplaceTokenFilterFactory::new));
         filters.put("persian_normalization", PersianNormalizationFilterFactory::new);
         filters.put("porter_stem", PorterStemTokenFilterFactory::new);
+        filters.put("predicate_token_filter",
+                requiresAnalysisSettings((i, e, n, s) -> new PredicateTokenFilterScriptFactory(i, n, s, scriptService.get())));
         filters.put("remove_duplicates", RemoveDuplicatesTokenFilterFactory::new);
         filters.put("reverse", ReverseTokenFilterFactory::new);
         filters.put("russian_stem", RussianStemTokenFilterFactory::new);
org/elasticsearch/analysis/common/PredicateTokenFilterScriptFactory.java (new file)
@@ -0,0 +1,73 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.script.Script;
import org.elasticsearch.script.ScriptService;
import org.elasticsearch.script.ScriptType;

import java.io.IOException;

/**
 * A factory for creating FilteringTokenFilters that determine whether or not to
 * accept their underlying token by consulting a script
 */
public class PredicateTokenFilterScriptFactory extends AbstractTokenFilterFactory {

    private final AnalysisPredicateScript.Factory factory;

    public PredicateTokenFilterScriptFactory(IndexSettings indexSettings, String name, Settings settings, ScriptService scriptService) {
        super(indexSettings, name, settings);
        Settings scriptSettings = settings.getAsSettings("script");
        Script script = Script.parse(scriptSettings);
        if (script.getType() != ScriptType.INLINE) {
            throw new IllegalArgumentException("Cannot use stored scripts in tokenfilter [" + name + "]");
        }
        this.factory = scriptService.compile(script, AnalysisPredicateScript.CONTEXT);
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new ScriptFilteringTokenFilter(tokenStream, factory.newInstance());
    }

    private static class ScriptFilteringTokenFilter extends FilteringTokenFilter {

        final AnalysisPredicateScript script;
        final AnalysisPredicateScript.Token token;

        ScriptFilteringTokenFilter(TokenStream in, AnalysisPredicateScript script) {
            super(in);
            this.script = script;
            this.token = new AnalysisPredicateScript.Token(this);
        }

        @Override
        protected boolean accept() throws IOException {
            token.updatePosition();
            return script.execute(token);
        }
    }
}
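
For context, FilteringTokenFilter calls accept() once per token and silently
drops every token for which it returns false, which is exactly the behaviour the
predicate script plugs into. A hand-rolled, script-free equivalent of the
"token.getTerm().length() > 5" example from the docs might look like the sketch
below; the class name is invented for illustration (ScriptFilteringTokenFilter
itself is private to the factory above):

import java.io.IOException;
import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Illustrative sketch: keeps only tokens longer than five characters,
// mirroring the painless predicate without a script service.
final class LongTokenOnlyFilter extends FilteringTokenFilter {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    LongTokenOnlyFilter(TokenStream in) {
        super(in);
    }

    @Override
    protected boolean accept() throws IOException {
        return termAtt.length() > 5; // false means the token is dropped
    }
}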
org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java

@@ -21,12 +21,6 @@

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
@@ -36,6 +30,7 @@
 import org.elasticsearch.script.ScriptService;
 import org.elasticsearch.script.ScriptType;
 
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
@@ -76,30 +71,26 @@ public TokenStream create(TokenStream tokenStream) {
             }
             return in;
         };
-        AnalysisPredicateScript script = factory.newInstance();
-        final AnalysisPredicateScript.Token token = new AnalysisPredicateScript.Token();
-        return new ConditionalTokenFilter(tokenStream, filter) {
+        return new ScriptedConditionTokenFilter(tokenStream, filter, factory.newInstance());
     }
 
-            CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-            PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
-            PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
-            OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-            TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-            KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
+    private static class ScriptedConditionTokenFilter extends ConditionalTokenFilter {
 
-            @Override
-            protected boolean shouldFilter() {
-                token.term = termAtt;
-                token.posInc = posIncAtt.getPositionIncrement();
-                token.pos += token.posInc;
-                token.posLen = posLenAtt.getPositionLength();
-                token.startOffset = offsetAtt.startOffset();
-                token.endOffset = offsetAtt.endOffset();
-                token.type = typeAtt.type();
-                token.isKeyword = keywordAtt.isKeyword();
-                return script.execute(token);
-            }
-        };
+        private final AnalysisPredicateScript script;
+        private final AnalysisPredicateScript.Token token;
+
+        ScriptedConditionTokenFilter(TokenStream input, Function<TokenStream, TokenStream> inputFactory,
+                                     AnalysisPredicateScript script) {
+            super(input, inputFactory);
+            this.script = script;
+            this.token = new AnalysisPredicateScript.Token(this);
+        }
+
+        @Override
+        protected boolean shouldFilter() throws IOException {
+            token.updatePosition();
+            return script.execute(token);
+        }
+    }
 
     @Override
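
Worth noting for contrast: unlike the FilteringTokenFilter used by
predicate_token_filter, a ConditionalTokenFilter does not itself remove tokens.
When shouldFilter() returns true the token is routed through the wrapped filter
chain, and otherwise it is emitted untouched. A hedged sketch of that contract
follows; the class name and the choice of LowerCaseFilter are illustrative, not
from this commit:

import java.io.IOException;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Illustrative sketch: lowercases only tokens longer than five characters;
// shorter tokens pass through unchanged rather than being dropped.
final class LowercaseLongTokensFilter extends ConditionalTokenFilter {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    LowercaseLongTokensFilter(TokenStream input) {
        super(input, LowerCaseFilter::new);
    }

    @Override
    protected boolean shouldFilter() throws IOException {
        return termAtt.length() > 5;
    }
}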