LUCENE-5752: switch to simpler Automaton implementation
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1603752 13f79535-47bb-0310-9956-ffa450edef68
mikemccand committed Jun 19, 2014
1 parent d01f393 commit a82928c
Showing 69 changed files with 4,610 additions and 3,755 deletions.
4 changes: 4 additions & 0 deletions lucene/CHANGES.txt
@@ -98,6 +98,10 @@ Other

(No Changes)

API Changes

* LUCENE-5752: Simplified Automaton API to be immutable. (Mike McCandless)

======================= Lucene 4.9.0 =======================

Changes in Runtime Behavior
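The reworked API, as the diffs below illustrate, deals in plain int state ids instead of State/Transition object graphs: states are created on the Automaton, transitions are added by source, destination and label, and the finished automaton is then used read-only. A minimal sketch of that flow under those assumptions; the class name, labels and inputs here are illustrative and not part of the commit:

    import org.apache.lucene.util.automaton.Automaton;
    import org.apache.lucene.util.automaton.CharacterRunAutomaton;

    public class AutomatonApiSketch {
      public static void main(String[] args) {
        Automaton a = new Automaton();
        int start = a.createState();            // states are plain ints
        int end = a.createState();
        a.setAccept(end, true);                 // mark the accept state
        a.addTransition(start, end, 'a');       // single-label transition
        a.addTransition(start, end, '0', '9');  // label-range transition: '0'..'9'
        a.finishState();                        // done adding transitions for this state
        CharacterRunAutomaton run = new CharacterRunAutomaton(a);
        System.out.println(run.run("a"));       // true: 'a' reaches the accept state
        System.out.println(run.run("aa"));      // false: nothing leaves the accept state
      }
    }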
@@ -31,11 +31,9 @@
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.automaton.Automaton;

/**
* Compares MockTokenizer (which is simple with no optimizations) with equivalent
@@ -50,18 +48,18 @@ public class TestDuelingAnalyzers extends BaseTokenStreamTestCase {
@Override
public void setUp() throws Exception {
super.setUp();
Automaton single = new Automaton();
int initial = single.createState();
int accept = single.createState();
single.setAccept(accept, true);

// build an automaton matching this jvm's letter definition
State initial = new State();
State accept = new State();
accept.setAccept(true);
for (int i = 0; i <= 0x10FFFF; i++) {
if (Character.isLetter(i)) {
initial.addTransition(new Transition(i, i, accept));
single.addTransition(initial, accept, i);
}
}
Automaton single = new Automaton(initial);
single.reduce();
Automaton repeat = BasicOperations.repeat(single);
Automaton repeat = Operations.repeat(single);
jvmLetter = new CharacterRunAutomaton(repeat);
}

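For reference, a hedged usage sketch of the composition in the rewritten setUp() above: Operations.repeat builds the Kleene closure of the single-letter automaton, and CharacterRunAutomaton compiles it for per-code-point matching. The helper name and sample inputs are illustrative:

    import org.apache.lucene.util.automaton.Automaton;
    import org.apache.lucene.util.automaton.CharacterRunAutomaton;
    import org.apache.lucene.util.automaton.Operations;

    // Illustrative helper mirroring setUp(): accept zero or more letter code points.
    static CharacterRunAutomaton letterRuns(Automaton singleLetter) {
      Automaton repeat = Operations.repeat(singleLetter);  // Kleene star of the one-letter automaton
      return new CharacterRunAutomaton(repeat);            // compiled per-code-point matcher
    }
    // letterRuns(single).run("abc") -> true; letterRuns(single).run("ab1") -> false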
1 change: 1 addition & 0 deletions lucene/build.xml
@@ -258,6 +258,7 @@
<!-- test-framework: problems -->

<!-- too much to fix core/ for now, but enforce full javadocs for key packages -->
<check-missing-javadocs dir="build/docs/core/org/apache/lucene/util/automaton" level="method"/>
<check-missing-javadocs dir="build/docs/core/org/apache/lucene/analysis" level="method"/>
<check-missing-javadocs dir="build/docs/core/org/apache/lucene/document" level="method"/>
<check-missing-javadocs dir="build/docs/core/org/apache/lucene/search/similarities" level="method"/>
@@ -29,8 +29,8 @@
import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat; // javadocs
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.OrdTermState;
import org.apache.lucene.index.SegmentReadState;
@@ -940,10 +940,11 @@ private final class DirectIntersectTermsEnum extends TermsEnum {
private final class State {
int changeOrd;
int state;
Transition[] transitions;
int transitionUpto;
int transitionCount;
int transitionMax;
int transitionMin;
final Transition transition = new Transition();
}

private State[] states;
@@ -957,7 +958,8 @@ public DirectIntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm)
states[0] = new State();
states[0].changeOrd = terms.length;
states[0].state = runAutomaton.getInitialState();
states[0].transitions = compiledAutomaton.sortedTransitions[states[0].state];
states[0].transitionCount = compiledAutomaton.automaton.getNumTransitions(states[0].state);
compiledAutomaton.automaton.initTransition(states[0].state, states[0].transition);
states[0].transitionUpto = -1;
states[0].transitionMax = -1;

@@ -978,9 +980,10 @@ public DirectIntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm)

while (label > states[i].transitionMax) {
states[i].transitionUpto++;
assert states[i].transitionUpto < states[i].transitions.length;
states[i].transitionMin = states[i].transitions[states[i].transitionUpto].getMin();
states[i].transitionMax = states[i].transitions[states[i].transitionUpto].getMax();
assert states[i].transitionUpto < states[i].transitionCount;
compiledAutomaton.automaton.getNextTransition(states[i].transition);
states[i].transitionMin = states[i].transition.min;
states[i].transitionMax = states[i].transition.max;
assert states[i].transitionMin >= 0;
assert states[i].transitionMin <= 255;
assert states[i].transitionMax >= 0;
@@ -1037,7 +1040,8 @@ public DirectIntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm)
stateUpto++;
states[stateUpto].changeOrd = skips[skipOffset + skipUpto++];
states[stateUpto].state = nextState;
states[stateUpto].transitions = compiledAutomaton.sortedTransitions[nextState];
states[stateUpto].transitionCount = compiledAutomaton.automaton.getNumTransitions(nextState);
compiledAutomaton.automaton.initTransition(states[stateUpto].state, states[stateUpto].transition);
states[stateUpto].transitionUpto = -1;
states[stateUpto].transitionMax = -1;
//System.out.println(" push " + states[stateUpto].transitions.length + " trans");
@@ -1191,7 +1195,7 @@ public BytesRef next() {
while (label > state.transitionMax) {
//System.out.println(" label=" + label + " vs max=" + state.transitionMax + " transUpto=" + state.transitionUpto + " vs " + state.transitions.length);
state.transitionUpto++;
if (state.transitionUpto == state.transitions.length) {
if (state.transitionUpto == state.transitionCount) {
// We've exhausted transitions leaving this
// state; force pop+next/skip now:
//System.out.println("forcepop: stateUpto=" + stateUpto);
@@ -1210,9 +1214,10 @@ }
}
continue nextTerm;
}
assert state.transitionUpto < state.transitions.length: " state.transitionUpto=" + state.transitionUpto + " vs " + state.transitions.length;
state.transitionMin = state.transitions[state.transitionUpto].getMin();
state.transitionMax = state.transitions[state.transitionUpto].getMax();
compiledAutomaton.automaton.getNextTransition(state.transition);
assert state.transitionUpto < state.transitionCount: " state.transitionUpto=" + state.transitionUpto + " vs " + state.transitionCount;
state.transitionMin = state.transition.min;
state.transitionMax = state.transition.max;
assert state.transitionMin >= 0;
assert state.transitionMin <= 255;
assert state.transitionMax >= 0;
@@ -1310,7 +1315,8 @@ public BytesRef next() {
stateUpto++;
states[stateUpto].state = nextState;
states[stateUpto].changeOrd = skips[skipOffset + skipUpto++];
states[stateUpto].transitions = compiledAutomaton.sortedTransitions[nextState];
states[stateUpto].transitionCount = compiledAutomaton.automaton.getNumTransitions(nextState);
compiledAutomaton.automaton.initTransition(nextState, states[stateUpto].transition);
states[stateUpto].transitionUpto = -1;
states[stateUpto].transitionMax = -1;

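DirectIntersectTermsEnum above now walks transitions with a single reusable Transition cursor per stack frame (getNumTransitions, initTransition, getNextTransition, and the public min/max/dest fields) instead of caching a sorted Transition[] per state. A hedged sketch of that iteration pattern; the helper method is illustrative:

    import org.apache.lucene.util.automaton.Automaton;
    import org.apache.lucene.util.automaton.Transition;

    // Illustrative helper: enumerate the transitions leaving one state.
    static void dumpTransitions(Automaton a, int state) {
      int count = a.getNumTransitions(state);  // how many transitions leave 'state'
      Transition t = new Transition();         // reusable cursor, as in the State class above
      a.initTransition(state, t);              // position the cursor on 'state'
      for (int i = 0; i < count; i++) {
        a.getNextTransition(t);                // fills t.dest, t.min, t.max
        System.out.println(state + " --[" + t.min + ".." + t.max + "]--> " + t.dest);
      }
    }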
@@ -26,8 +26,6 @@
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RollingBuffer;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.automaton.Transition;

// TODO: maybe also toFST? then we can translate atts into FST outputs/weights

@@ -61,15 +59,15 @@ public void setUnicodeArcs(boolean unicodeArcs) {

private static class Position implements RollingBuffer.Resettable {
// Any tokens that ended at our position arrive to this state:
State arriving;
int arriving = -1;

// Any tokens that start at our position leave from this state:
State leaving;
int leaving = -1;

@Override
public void reset() {
arriving = null;
leaving = null;
arriving = -1;
leaving = -1;
}
}

@@ -99,8 +97,8 @@ protected BytesRef changeToken(BytesRef in) {
* automaton where arcs are bytes (or Unicode code points
* if unicodeArcs = true) from each term. */
public Automaton toAutomaton(TokenStream in) throws IOException {
final Automaton a = new Automaton();
boolean deterministic = true;
final Automaton.Builder builder = new Automaton.Builder();
builder.createState();

final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
@@ -132,34 +130,29 @@ public Automaton toAutomaton(TokenStream in) throws IOException {
pos += posInc;

posData = positions.get(pos);
assert posData.leaving == null;
assert posData.leaving == -1;

if (posData.arriving == null) {
if (posData.arriving == -1) {
// No token ever arrived to this position
if (pos == 0) {
// OK: this is the first token
posData.leaving = a.getInitialState();
posData.leaving = 0;
} else {
// This means there's a hole (eg, StopFilter
// does this):
posData.leaving = new State();
addHoles(a.getInitialState(), positions, pos);
posData.leaving = builder.createState();
addHoles(builder, positions, pos);
}
} else {
posData.leaving = new State();
posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));
posData.leaving = builder.createState();
builder.addTransition(posData.arriving, posData.leaving, POS_SEP);
if (posInc > 1) {
// A token spanned over a hole; add holes
// "under" it:
addHoles(a.getInitialState(), positions, pos);
addHoles(builder, positions, pos);
}
}
positions.freeBefore(pos);
} else {
// note: this isn't necessarily true. its just that we aren't surely det.
// we could optimize this further (e.g. buffer and sort synonyms at a position)
// but thats probably overkill. this is cheap and dirty
deterministic = false;
}

final int endPos = pos + posLengthAtt.getPositionLength();
@@ -168,60 +161,60 @@ public Automaton toAutomaton(TokenStream in) throws IOException {
final BytesRef termUTF8 = changeToken(term);
int[] termUnicode = null;
final Position endPosData = positions.get(endPos);
if (endPosData.arriving == null) {
endPosData.arriving = new State();
if (endPosData.arriving == -1) {
endPosData.arriving = builder.createState();
}

State state = posData.leaving;
int termLen;
if (unicodeArcs) {
final String utf16 = termUTF8.utf8ToString();
termUnicode = new int[utf16.codePointCount(0, utf16.length())];
termLen = termUnicode.length;
for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp))
for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
termUnicode[j++] = cp = utf16.codePointAt(i);
}
} else {
termLen = termUTF8.length;
}

int state = posData.leaving;

for(int byteIDX=0;byteIDX<termLen;byteIDX++) {
final State nextState = byteIDX == termLen-1 ? endPosData.arriving : new State();
final int nextState = byteIDX == termLen-1 ? endPosData.arriving : builder.createState();
int c;
if (unicodeArcs) {
c = termUnicode[byteIDX];
} else {
c = termUTF8.bytes[termUTF8.offset + byteIDX] & 0xff;
}
state.addTransition(new Transition(c, nextState));
builder.addTransition(state, nextState, c);
state = nextState;
}

maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
}

in.end();
State endState = null;
int endState = -1;
if (offsetAtt.endOffset() > maxOffset) {
endState = new State();
endState.setAccept(true);
endState = builder.createState();
builder.setAccept(endState, true);
}

pos++;
while (pos <= positions.getMaxPos()) {
posData = positions.get(pos);
if (posData.arriving != null) {
if (endState != null) {
posData.arriving.addTransition(new Transition(POS_SEP, endState));
if (posData.arriving != -1) {
if (endState != -1) {
builder.addTransition(posData.arriving, endState, POS_SEP);
} else {
posData.arriving.setAccept(true);
builder.setAccept(posData.arriving, true);
}
}
pos++;
}

//toDot(a);
a.setDeterministic(deterministic);
return a;
return builder.finish();
}

// for debugging!
@@ -235,26 +228,26 @@ private static void toDot(Automaton a) throws IOException {
}
*/

private static void addHoles(State startState, RollingBuffer<Position> positions, int pos) {
private static void addHoles(Automaton.Builder builder, RollingBuffer<Position> positions, int pos) {
Position posData = positions.get(pos);
Position prevPosData = positions.get(pos-1);

while(posData.arriving == null || prevPosData.leaving == null) {
if (posData.arriving == null) {
posData.arriving = new State();
posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));
while(posData.arriving == -1 || prevPosData.leaving == -1) {
if (posData.arriving == -1) {
posData.arriving = builder.createState();
builder.addTransition(posData.arriving, posData.leaving, POS_SEP);
}
if (prevPosData.leaving == null) {
if (prevPosData.leaving == -1) {
if (pos == 1) {
prevPosData.leaving = startState;
prevPosData.leaving = 0;
} else {
prevPosData.leaving = new State();
prevPosData.leaving = builder.createState();
}
if (prevPosData.arriving != null) {
prevPosData.arriving.addTransition(new Transition(POS_SEP, prevPosData.leaving));
if (prevPosData.arriving != -1) {
builder.addTransition(prevPosData.arriving, prevPosData.leaving, POS_SEP);
}
}
prevPosData.leaving.addTransition(new Transition(HOLE, posData.arriving));
builder.addTransition(prevPosData.leaving, posData.arriving, HOLE);
pos--;
if (pos <= 0) {
break;
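TokenStreamToAutomaton above now accumulates states and transitions through Automaton.Builder and materializes the result with finish(); per-position State objects become int ids (-1 meaning no state yet), and the explicit deterministic bookkeeping disappears. A minimal Builder sketch based on the calls visible in this diff; the labels are illustrative:

    import org.apache.lucene.util.automaton.Automaton;

    // Illustrative Builder usage: transitions may be added in any order,
    // and finish() sorts them into the completed automaton.
    static Automaton aOrB() {
      Automaton.Builder builder = new Automaton.Builder();
      int start = builder.createState();       // state 0, the initial state
      int end = builder.createState();
      builder.setAccept(end, true);
      builder.addTransition(start, end, 'b');  // out-of-order additions are fine
      builder.addTransition(start, end, 'a');
      return builder.finish();
    }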
@@ -18,43 +18,25 @@
*/

import java.io.IOException;
import java.io.PrintStream;
import java.util.Collections;
import java.util.Iterator;
import java.util.TreeMap;

import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RunAutomaton;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;

/** A block-based terms index and dictionary that assigns
* terms to variable length blocks according to how they