Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add script_filter tokenfilter #33431

Merged
merged 6 commits into from
Sep 11, 2018
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/reference/analysis/tokenfilters.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ include::tokenfilters/multiplexer-tokenfilter.asciidoc[]

include::tokenfilters/condition-tokenfilter.asciidoc[]

include::tokenfilters/scriptfilter-tokenfilter.asciidoc[]

include::tokenfilters/stemmer-tokenfilter.asciidoc[]

include::tokenfilters/stemmer-override-tokenfilter.asciidoc[]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
[[analysis-scriptfilter-tokenfilter]]
=== Scripted Filtering Token Filter

The script_filter token filter takes a predicate script, and removes tokens that do
not match the predicate.

[float]
=== Options
[horizontal]
script:: a predicate script that determines whether or not the current token will
be emitted. Note that only inline scripts are supported.

[float]
=== Settings example

You can set it up like:

[source,js]
--------------------------------------------------
PUT /condition_example
{
"settings" : {
"analysis" : {
"analyzer" : {
"my_analyzer" : {
"tokenizer" : "standard",
"filter" : [ "my_script_filter" ]
}
},
"filter" : {
"my_script_filter" : {
"type" : "script_filter",
"script" : {
"source" : "token.getTerm().length() > 5" <1>
}
}
}
}
}
}
--------------------------------------------------
// CONSOLE

<1> This will emit tokens that are more than 5 characters long
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This description would make more sense with positive logic, since the predicate is positive based. So something like:

<1> This will emit tokens that are more than 5 characters long


And test it like:

[source,js]
--------------------------------------------------
POST /condition_example/_analyze
{
"analyzer" : "my_analyzer",
"text" : "What Flapdoodle"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]

And it'd respond:

[source,js]
--------------------------------------------------
{
"tokens": [
{
"token": "Flapdoodle", <1>
"start_offset": 5,
"end_offset": 15,
"type": "<ALPHANUM>",
"position": 1 <2>
}
]
}
--------------------------------------------------
// TESTRESPONSE

<1> The token 'What' has been removed from the token stream because it does not
match the predicate.
<2> The position and offset values are unaffected by the removal of earlier tokens.
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,13 @@

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.elasticsearch.script.ScriptContext;

/**
Expand All @@ -30,45 +37,64 @@ public abstract class AnalysisPredicateScript {
* Encapsulation of the state of the current token
*/
public static class Token {
public CharSequence term;
public int pos;
public int posInc;
public int posLen;
public int startOffset;
public int endOffset;
public String type;
public boolean isKeyword;

private final CharTermAttribute termAtt;
private final PositionIncrementAttribute posIncAtt;
private final PositionLengthAttribute posLenAtt;
private final OffsetAttribute offsetAtt;
private final TypeAttribute typeAtt;
private final KeywordAttribute keywordAtt;

// posInc is always 1 at the beginning of a tokenstream and the convention
// from the _analyze endpoint is that tokenstream positions are 0-based
private int pos = -1;

/**
* Create a token exposing values from an AttributeSource
*/
public Token(AttributeSource source) {
this.termAtt = source.addAttribute(CharTermAttribute.class);
this.posIncAtt = source.addAttribute(PositionIncrementAttribute.class);
this.posLenAtt = source.addAttribute(PositionLengthAttribute.class);
this.offsetAtt = source.addAttribute(OffsetAttribute.class);
this.typeAtt = source.addAttribute(TypeAttribute.class);
this.keywordAtt = source.addAttribute(KeywordAttribute.class);
}

public void updatePosition() {
this.pos = this.pos + posIncAtt.getPositionIncrement();
}

public CharSequence getTerm() {
return term;
return termAtt;
}

public int getPositionIncrement() {
return posInc;
return posIncAtt.getPositionIncrement();
}

public int getPosition() {
return pos;
}

public int getPositionLength() {
return posLen;
return posLenAtt.getPositionLength();
}

public int getStartOffset() {
return startOffset;
return offsetAtt.startOffset();
}

public int getEndOffset() {
return endOffset;
return offsetAtt.endOffset();
}

public String getType() {
return type;
return typeAtt.type();
}

public boolean isKeyword() {
return isKeyword;
return keywordAtt.isKeyword();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,8 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
filters.put("russian_stem", RussianStemTokenFilterFactory::new);
filters.put("scandinavian_folding", ScandinavianFoldingFilterFactory::new);
filters.put("scandinavian_normalization", ScandinavianNormalizationFilterFactory::new);
filters.put("script_filter",
requiresAnalysisSettings((i, e, n, s) -> new ScriptedFilteringTokenFilterFactory(i, n, s, scriptService.get())));
filters.put("serbian_normalization", SerbianNormalizationFilterFactory::new);
filters.put("snowball", SnowballTokenFilterFactory::new);
filters.put("sorani_normalization", SoraniNormalizationFilterFactory::new);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,6 @@

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
Expand All @@ -36,6 +30,7 @@
import org.elasticsearch.script.ScriptService;
import org.elasticsearch.script.ScriptType;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
Expand Down Expand Up @@ -76,30 +71,26 @@ public TokenStream create(TokenStream tokenStream) {
}
return in;
};
AnalysisPredicateScript script = factory.newInstance();
final AnalysisPredicateScript.Token token = new AnalysisPredicateScript.Token();
return new ConditionalTokenFilter(tokenStream, filter) {
return new ScriptedConditionTokenFilter(tokenStream, filter, factory.newInstance());
}

CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
private static class ScriptedConditionTokenFilter extends ConditionalTokenFilter {

@Override
protected boolean shouldFilter() {
token.term = termAtt;
token.posInc = posIncAtt.getPositionIncrement();
token.pos += token.posInc;
token.posLen = posLenAtt.getPositionLength();
token.startOffset = offsetAtt.startOffset();
token.endOffset = offsetAtt.endOffset();
token.type = typeAtt.type();
token.isKeyword = keywordAtt.isKeyword();
return script.execute(token);
}
};
private final AnalysisPredicateScript script;
private final AnalysisPredicateScript.Token token;

ScriptedConditionTokenFilter(TokenStream input, Function<TokenStream, TokenStream> inputFactory,
AnalysisPredicateScript script) {
super(input, inputFactory);
this.script = script;
this.token = new AnalysisPredicateScript.Token(this);
}

@Override
protected boolean shouldFilter() throws IOException {
token.updatePosition();
return script.execute(token);
}
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.script.Script;
import org.elasticsearch.script.ScriptService;
import org.elasticsearch.script.ScriptType;

import java.io.IOException;

/**
 * A factory for creating FilteringTokenFilters that determine whether or not to
 * accept their underlying token by consulting a script.
 *
 * The script is compiled once, when the factory is constructed, against
 * {@link AnalysisPredicateScript#CONTEXT}; each call to {@link #create(TokenStream)}
 * obtains a fresh script instance from the compiled factory.
 */
public class ScriptedFilteringTokenFilterFactory extends AbstractTokenFilterFactory {

    private final AnalysisPredicateScript.Factory factory;

    /**
     * Reads the {@code script} sub-settings of this filter and compiles the predicate.
     *
     * @param indexSettings settings of the index this filter belongs to
     * @param name          name of the token filter, used in error messages
     * @param settings      settings of this token filter; the script is read from the {@code script} key
     * @param scriptService service used to compile the predicate script
     * @throws IllegalArgumentException if the configured script is not an inline script
     */
    public ScriptedFilteringTokenFilterFactory(IndexSettings indexSettings, String name, Settings settings, ScriptService scriptService) {
        super(indexSettings, name, settings);
        Settings scriptSettings = settings.getAsSettings("script");
        Script script = Script.parse(scriptSettings);
        // Only inline scripts are accepted; anything else is rejected up front.
        if (script.getType() != ScriptType.INLINE) {
            throw new IllegalArgumentException("Cannot use stored scripts in tokenfilter [" + name + "]");
        }
        this.factory = scriptService.compile(script, AnalysisPredicateScript.CONTEXT);
    }

    /**
     * Wraps the given stream in a filter that keeps only the tokens accepted by the script.
     */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new ScriptFilteringTokenFilter(tokenStream, factory.newInstance());
    }

    /**
     * A FilteringTokenFilter whose accept decision is delegated to an
     * {@link AnalysisPredicateScript}.
     */
    private static class ScriptFilteringTokenFilter extends FilteringTokenFilter {

        final AnalysisPredicateScript script;
        // Not final: rebuilt on every reset() so that the running position it
        // tracks starts over for each new input.
        AnalysisPredicateScript.Token token;

        ScriptFilteringTokenFilter(TokenStream in, AnalysisPredicateScript script) {
            super(in);
            this.script = script;
            this.token = new AnalysisPredicateScript.Token(this);
        }

        @Override
        protected boolean accept() throws IOException {
            // Advance the token's absolute position before the script inspects it.
            token.updatePosition();
            return script.execute(token);
        }

        @Override
        public void reset() throws IOException {
            super.reset();
            // Token stream instances are reused across inputs; without a fresh
            // Token the accumulated position from the previous input would leak
            // into the next one and getPosition() would report wrong values.
            this.token = new AnalysisPredicateScript.Token(this);
        }
    }
}
Loading