Commit
Add predicate_token_filter (#33431)
This allows users to filter out tokens from a TokenStream using painless scripts,
instead of having to write specialised Java code and package it up into a plugin.

The commit also refactors the AnalysisPredicateScript.Token class so that it wraps
an AttributeSource and exposes it read-only.
romseygeek committed Sep 11, 2018
1 parent a55fa4f commit f598297
Showing 8 changed files with 341 additions and 44 deletions.
docs/reference/analysis/tokenfilters.asciidoc (2 additions, 0 deletions)
@@ -37,6 +37,8 @@ include::tokenfilters/multiplexer-tokenfilter.asciidoc[]
 
 include::tokenfilters/condition-tokenfilter.asciidoc[]
 
+include::tokenfilters/predicate-tokenfilter.asciidoc[]
+
 include::tokenfilters/stemmer-tokenfilter.asciidoc[]
 
 include::tokenfilters/stemmer-override-tokenfilter.asciidoc[]
docs/reference/analysis/tokenfilters/predicate-tokenfilter.asciidoc (new file)
@@ -0,0 +1,79 @@
[[analysis-predicatefilter-tokenfilter]]
=== Predicate Token Filter Script

The `predicate_token_filter` token filter takes a predicate script and removes
tokens that do not match the predicate.

[float]
=== Options
[horizontal]
script:: a predicate script that determines whether the current token will
be emitted. Note that only inline scripts are supported.

[float]
=== Settings example

You can set it up like:

[source,js]
--------------------------------------------------
PUT /condition_example
{
  "settings" : {
    "analysis" : {
      "analyzer" : {
        "my_analyzer" : {
          "tokenizer" : "standard",
          "filter" : [ "my_script_filter" ]
        }
      },
      "filter" : {
        "my_script_filter" : {
          "type" : "predicate_token_filter",
          "script" : {
            "source" : "token.getTerm().length() > 5" <1>
          }
        }
      }
    }
  }
}
--------------------------------------------------
// CONSOLE

<1> Only tokens that are more than 5 characters long will be emitted

And test it like:

[source,js]
--------------------------------------------------
POST /condition_example/_analyze
{
  "analyzer" : "my_analyzer",
  "text" : "What Flapdoodle"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]

And it'd respond:

[source,js]
--------------------------------------------------
{
  "tokens": [
    {
      "token": "Flapdoodle", <1>
      "start_offset": 5,
      "end_offset": 15,
      "type": "<ALPHANUM>",
      "position": 1 <2>
    }
  ]
}
--------------------------------------------------
// TESTRESPONSE

<1> The token 'What' has been removed from the tokenstream because it does not
match the predicate.
<2> The position and offset values are unaffected by the removal of earlier tokens.
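
The predicate sees the whole token state, not just the term: position, offsets,
type, position increment and length, and keyword status are all available through
the getters on `AnalysisPredicateScript.Token`. As a purely illustrative variation
(the index and filter names here are hypothetical, not part of this commit), a
filter that drops the first token of every stream could test the 0-based position:

[source,js]
--------------------------------------------------
PUT /hypothetical_example
{
  "settings" : {
    "analysis" : {
      "filter" : {
        "skip_first_token" : {
          "type" : "predicate_token_filter",
          "script" : {
            "source" : "token.getPosition() > 0"
          }
        }
      }
    }
  }
}
--------------------------------------------------
// NOTCONSOLE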
org/elasticsearch/analysis/common/AnalysisPredicateScript.java

@@ -19,6 +19,13 @@

 package org.elasticsearch.analysis.common;
 
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
 import org.elasticsearch.script.ScriptContext;
 
 /**
@@ -30,45 +37,64 @@ public abstract class AnalysisPredicateScript {
      * Encapsulation of the state of the current token
      */
     public static class Token {
-        public CharSequence term;
-        public int pos;
-        public int posInc;
-        public int posLen;
-        public int startOffset;
-        public int endOffset;
-        public String type;
-        public boolean isKeyword;
+
+        private final CharTermAttribute termAtt;
+        private final PositionIncrementAttribute posIncAtt;
+        private final PositionLengthAttribute posLenAtt;
+        private final OffsetAttribute offsetAtt;
+        private final TypeAttribute typeAtt;
+        private final KeywordAttribute keywordAtt;
+
+        // posInc is always 1 at the beginning of a tokenstream and the convention
+        // from the _analyze endpoint is that tokenstream positions are 0-based
+        private int pos = -1;
+
+        /**
+         * Create a token exposing values from an AttributeSource
+         */
+        public Token(AttributeSource source) {
+            this.termAtt = source.addAttribute(CharTermAttribute.class);
+            this.posIncAtt = source.addAttribute(PositionIncrementAttribute.class);
+            this.posLenAtt = source.addAttribute(PositionLengthAttribute.class);
+            this.offsetAtt = source.addAttribute(OffsetAttribute.class);
+            this.typeAtt = source.addAttribute(TypeAttribute.class);
+            this.keywordAtt = source.addAttribute(KeywordAttribute.class);
+        }
+
+        public void updatePosition() {
+            this.pos = this.pos + posIncAtt.getPositionIncrement();
+        }
 
         public CharSequence getTerm() {
-            return term;
+            return termAtt;
         }
 
         public int getPositionIncrement() {
-            return posInc;
+            return posIncAtt.getPositionIncrement();
         }
 
         public int getPosition() {
             return pos;
         }
 
         public int getPositionLength() {
-            return posLen;
+            return posLenAtt.getPositionLength();
         }
 
         public int getStartOffset() {
-            return startOffset;
+            return offsetAtt.startOffset();
         }
 
         public int getEndOffset() {
-            return endOffset;
+            return offsetAtt.endOffset();
         }
 
         public String getType() {
-            return type;
+            return typeAtt.type();
         }
 
         public boolean isKeyword() {
-            return isKeyword;
+            return keywordAtt.isKeyword();
         }
     }
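
To make the new wrapper concrete: Token registers the six attributes on the
supplied AttributeSource once, and its getters then read whatever the stream
currently holds, so a single Token instance stays valid across incrementToken()
calls. A minimal sketch of driving it from a plain Lucene tokenizer follows; the
WhitespaceTokenizer, the demo text, and placing the class in the
org.elasticsearch.analysis.common package are assumptions for illustration, not
part of this commit.

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;

public class TokenWrapperDemo {
    public static void main(String[] args) throws Exception {
        // A Tokenizer is an AttributeSource, so Token can wrap it directly
        Tokenizer stream = new WhitespaceTokenizer();
        stream.setReader(new StringReader("What Flapdoodle"));
        AnalysisPredicateScript.Token token = new AnalysisPredicateScript.Token(stream);
        stream.reset();
        while (stream.incrementToken()) {
            token.updatePosition(); // accumulates position increments, 0-based
            System.out.println(token.getTerm() + " @ position " + token.getPosition());
        }
        stream.end();
        stream.close();
    }
}

This prints "What @ position 0" followed by "Flapdoodle @ position 1", matching
the 0-based convention noted in the comment above.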

org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
@@ -264,6 +264,8 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
         filters.put("pattern_replace", requiresAnalysisSettings(PatternReplaceTokenFilterFactory::new));
         filters.put("persian_normalization", PersianNormalizationFilterFactory::new);
         filters.put("porter_stem", PorterStemTokenFilterFactory::new);
+        filters.put("predicate_token_filter",
+                requiresAnalysisSettings((i, e, n, s) -> new PredicateTokenFilterScriptFactory(i, n, s, scriptService.get())));
         filters.put("remove_duplicates", RemoveDuplicatesTokenFilterFactory::new);
         filters.put("reverse", ReverseTokenFilterFactory::new);
         filters.put("russian_stem", RussianStemTokenFilterFactory::new);
org/elasticsearch/analysis/common/PredicateTokenFilterScriptFactory.java (new file)
@@ -0,0 +1,73 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.script.Script;
import org.elasticsearch.script.ScriptService;
import org.elasticsearch.script.ScriptType;

import java.io.IOException;

/**
 * A factory for creating FilteringTokenFilters that determine whether or not to
 * accept their underlying token by consulting a script
 */
public class PredicateTokenFilterScriptFactory extends AbstractTokenFilterFactory {

    private final AnalysisPredicateScript.Factory factory;

    public PredicateTokenFilterScriptFactory(IndexSettings indexSettings, String name, Settings settings, ScriptService scriptService) {
        super(indexSettings, name, settings);
        Settings scriptSettings = settings.getAsSettings("script");
        Script script = Script.parse(scriptSettings);
        if (script.getType() != ScriptType.INLINE) {
            throw new IllegalArgumentException("Cannot use stored scripts in tokenfilter [" + name + "]");
        }
        this.factory = scriptService.compile(script, AnalysisPredicateScript.CONTEXT);
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new ScriptFilteringTokenFilter(tokenStream, factory.newInstance());
    }

    private static class ScriptFilteringTokenFilter extends FilteringTokenFilter {

        final AnalysisPredicateScript script;
        final AnalysisPredicateScript.Token token;

        ScriptFilteringTokenFilter(TokenStream in, AnalysisPredicateScript script) {
            super(in);
            this.script = script;
            this.token = new AnalysisPredicateScript.Token(this);
        }

        @Override
        protected boolean accept() throws IOException {
            token.updatePosition();
            return script.execute(token);
        }
    }
}
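
For context, FilteringTokenFilter calls accept() once per token and silently
drops every token for which it returns false, which is exactly the behaviour the
predicate script plugs into. A hand-rolled, script-free equivalent of the
"token.getTerm().length() > 5" example from the docs might look like the sketch
below; the class name is invented for illustration (ScriptFilteringTokenFilter
itself is private to the factory above):

import java.io.IOException;
import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Illustrative sketch: keeps only tokens longer than five characters,
// mirroring the painless predicate without a script service.
final class LongTokenOnlyFilter extends FilteringTokenFilter {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    LongTokenOnlyFilter(TokenStream in) {
        super(in);
    }

    @Override
    protected boolean accept() throws IOException {
        return termAtt.length() > 5; // false means the token is dropped
    }
}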
org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java

@@ -21,12 +21,6 @@

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
@@ -36,6 +30,7 @@
 import org.elasticsearch.script.ScriptService;
 import org.elasticsearch.script.ScriptType;
 
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
@@ -76,30 +71,26 @@ public TokenStream create(TokenStream tokenStream) {
             }
             return in;
         };
-        AnalysisPredicateScript script = factory.newInstance();
-        final AnalysisPredicateScript.Token token = new AnalysisPredicateScript.Token();
-        return new ConditionalTokenFilter(tokenStream, filter) {
+        return new ScriptedConditionTokenFilter(tokenStream, filter, factory.newInstance());
     }
 
-            CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-            PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
-            PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
-            OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-            TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-            KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
+    private static class ScriptedConditionTokenFilter extends ConditionalTokenFilter {
 
-            @Override
-            protected boolean shouldFilter() {
-                token.term = termAtt;
-                token.posInc = posIncAtt.getPositionIncrement();
-                token.pos += token.posInc;
-                token.posLen = posLenAtt.getPositionLength();
-                token.startOffset = offsetAtt.startOffset();
-                token.endOffset = offsetAtt.endOffset();
-                token.type = typeAtt.type();
-                token.isKeyword = keywordAtt.isKeyword();
-                return script.execute(token);
-            }
-        };
+        private final AnalysisPredicateScript script;
+        private final AnalysisPredicateScript.Token token;
+
+        ScriptedConditionTokenFilter(TokenStream input, Function<TokenStream, TokenStream> inputFactory,
+                                     AnalysisPredicateScript script) {
+            super(input, inputFactory);
+            this.script = script;
+            this.token = new AnalysisPredicateScript.Token(this);
+        }
+
+        @Override
+        protected boolean shouldFilter() throws IOException {
+            token.updatePosition();
+            return script.execute(token);
+        }
+    }
 
     @Override
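
Worth noting for contrast: unlike the FilteringTokenFilter used by
predicate_token_filter, a ConditionalTokenFilter does not itself remove tokens.
When shouldFilter() returns true the token is routed through the wrapped filter
chain, and otherwise it is emitted untouched. A hedged sketch of that contract
follows; the class name and the choice of LowerCaseFilter are illustrative, not
from this commit:

import java.io.IOException;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Illustrative sketch: lowercases only tokens longer than five characters;
// shorter tokens pass through unchanged rather than being dropped.
final class LowercaseLongTokensFilter extends ConditionalTokenFilter {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    LowercaseLongTokensFilter(TokenStream input) {
        super(input, LowerCaseFilter::new);
    }

    @Override
    protected boolean shouldFilter() throws IOException {
        return termAtt.length() > 5;
    }
}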