diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 2807bc065c7e..dcec52cb94bd 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -98,6 +98,10 @@ Other (No Changes) +API Changes + +* LUCENE-5752: Simplified Automaton API to be immutable. (Mike McCandless) + ======================= Lucene 4.9.0 ======================= Changes in Runtime Behavior diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDuelingAnalyzers.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDuelingAnalyzers.java index 7c4334b72774..8d97f33d7fee 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDuelingAnalyzers.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDuelingAnalyzers.java @@ -31,11 +31,9 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.util.TestUtil; -import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.BasicOperations; +import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.CharacterRunAutomaton; -import org.apache.lucene.util.automaton.State; -import org.apache.lucene.util.automaton.Transition; +import org.apache.lucene.util.automaton.Automaton; /** * Compares MockTokenizer (which is simple with no optimizations) with equivalent @@ -50,18 +48,18 @@ public class TestDuelingAnalyzers extends BaseTokenStreamTestCase { @Override public void setUp() throws Exception { super.setUp(); + Automaton single = new Automaton(); + int initial = single.createState(); + int accept = single.createState(); + single.setAccept(accept, true); + // build an automaton matching this jvm's letter definition - State initial = new State(); - State accept = new State(); - accept.setAccept(true); for (int i = 0; i <= 0x10FFFF; i++) { if (Character.isLetter(i)) { - initial.addTransition(new Transition(i, i, accept)); + single.addTransition(initial, accept, i); } } - Automaton single = new Automaton(initial); - single.reduce(); - Automaton repeat = BasicOperations.repeat(single); + Automaton repeat = Operations.repeat(single); jvmLetter = new CharacterRunAutomaton(repeat); } diff --git a/lucene/build.xml b/lucene/build.xml index 39cf296ce455..280800b9fce2 100644 --- a/lucene/build.xml +++ b/lucene/build.xml @@ -258,6 +258,7 @@ + diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java index 149a63d83131..e8d5d62045ee 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java @@ -29,8 +29,8 @@ import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat; // javadocs import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsEnum; -import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.Fields; import org.apache.lucene.index.OrdTermState; import org.apache.lucene.index.SegmentReadState; @@ -940,10 +940,11 @@ private final class DirectIntersectTermsEnum extends TermsEnum { private final class State { int changeOrd; int state; - Transition[] transitions; int transitionUpto; + int transitionCount; int transitionMax; int transitionMin; + final Transition transition = new Transition(); } private State[] states; @@ -957,7 +958,8 @@ public DirectIntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) states[0] = new State(); states[0].changeOrd = terms.length; states[0].state = runAutomaton.getInitialState(); - states[0].transitions = compiledAutomaton.sortedTransitions[states[0].state]; + states[0].transitionCount = compiledAutomaton.automaton.getNumTransitions(states[0].state); + compiledAutomaton.automaton.initTransition(states[0].state, states[0].transition); states[0].transitionUpto = -1; states[0].transitionMax = -1; @@ -978,9 +980,10 @@ public DirectIntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) while (label > states[i].transitionMax) { states[i].transitionUpto++; - assert states[i].transitionUpto < states[i].transitions.length; - states[i].transitionMin = states[i].transitions[states[i].transitionUpto].getMin(); - states[i].transitionMax = states[i].transitions[states[i].transitionUpto].getMax(); + assert states[i].transitionUpto < states[i].transitionCount; + compiledAutomaton.automaton.getNextTransition(states[i].transition); + states[i].transitionMin = states[i].transition.min; + states[i].transitionMax = states[i].transition.max; assert states[i].transitionMin >= 0; assert states[i].transitionMin <= 255; assert states[i].transitionMax >= 0; @@ -1037,7 +1040,8 @@ public DirectIntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) stateUpto++; states[stateUpto].changeOrd = skips[skipOffset + skipUpto++]; states[stateUpto].state = nextState; - states[stateUpto].transitions = compiledAutomaton.sortedTransitions[nextState]; + states[stateUpto].transitionCount = compiledAutomaton.automaton.getNumTransitions(nextState); + compiledAutomaton.automaton.initTransition(states[stateUpto].state, states[stateUpto].transition); states[stateUpto].transitionUpto = -1; states[stateUpto].transitionMax = -1; //System.out.println(" push " + states[stateUpto].transitions.length + " trans"); @@ -1191,7 +1195,7 @@ public BytesRef next() { while (label > state.transitionMax) { //System.out.println(" label=" + label + " vs max=" + state.transitionMax + " transUpto=" + state.transitionUpto + " vs " + state.transitions.length); state.transitionUpto++; - if (state.transitionUpto == state.transitions.length) { + if (state.transitionUpto == state.transitionCount) { // We've exhausted transitions leaving this // state; force pop+next/skip now: //System.out.println("forcepop: stateUpto=" + stateUpto); @@ -1210,9 +1214,10 @@ public BytesRef next() { } continue nextTerm; } - assert state.transitionUpto < state.transitions.length: " state.transitionUpto=" + state.transitionUpto + " vs " + state.transitions.length; - state.transitionMin = state.transitions[state.transitionUpto].getMin(); - state.transitionMax = state.transitions[state.transitionUpto].getMax(); + compiledAutomaton.automaton.getNextTransition(state.transition); + assert state.transitionUpto < state.transitionCount: " state.transitionUpto=" + state.transitionUpto + " vs " + state.transitionCount; + state.transitionMin = state.transition.min; + state.transitionMax = state.transition.max; assert state.transitionMin >= 0; assert state.transitionMin <= 255; assert state.transitionMax >= 0; @@ -1310,7 +1315,8 @@ public BytesRef next() { stateUpto++; states[stateUpto].state = nextState; states[stateUpto].changeOrd = skips[skipOffset + skipUpto++]; - states[stateUpto].transitions = compiledAutomaton.sortedTransitions[nextState]; + states[stateUpto].transitionCount = compiledAutomaton.automaton.getNumTransitions(nextState); + compiledAutomaton.automaton.initTransition(nextState, states[stateUpto].transition); states[stateUpto].transitionUpto = -1; states[stateUpto].transitionMax = -1; diff --git a/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java b/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java index 2bb3aec57b45..aedaa436182b 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java @@ -26,8 +26,6 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.RollingBuffer; import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.State; -import org.apache.lucene.util.automaton.Transition; // TODO: maybe also toFST? then we can translate atts into FST outputs/weights @@ -61,15 +59,15 @@ public void setUnicodeArcs(boolean unicodeArcs) { private static class Position implements RollingBuffer.Resettable { // Any tokens that ended at our position arrive to this state: - State arriving; + int arriving = -1; // Any tokens that start at our position leave from this state: - State leaving; + int leaving = -1; @Override public void reset() { - arriving = null; - leaving = null; + arriving = -1; + leaving = -1; } } @@ -99,8 +97,8 @@ protected BytesRef changeToken(BytesRef in) { * automaton where arcs are bytes (or Unicode code points * if unicodeArcs = true) from each term. */ public Automaton toAutomaton(TokenStream in) throws IOException { - final Automaton a = new Automaton(); - boolean deterministic = true; + final Automaton.Builder builder = new Automaton.Builder(); + builder.createState(); final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class); final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class); @@ -132,34 +130,29 @@ public Automaton toAutomaton(TokenStream in) throws IOException { pos += posInc; posData = positions.get(pos); - assert posData.leaving == null; + assert posData.leaving == -1; - if (posData.arriving == null) { + if (posData.arriving == -1) { // No token ever arrived to this position if (pos == 0) { // OK: this is the first token - posData.leaving = a.getInitialState(); + posData.leaving = 0; } else { // This means there's a hole (eg, StopFilter // does this): - posData.leaving = new State(); - addHoles(a.getInitialState(), positions, pos); + posData.leaving = builder.createState(); + addHoles(builder, positions, pos); } } else { - posData.leaving = new State(); - posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving)); + posData.leaving = builder.createState(); + builder.addTransition(posData.arriving, posData.leaving, POS_SEP); if (posInc > 1) { // A token spanned over a hole; add holes // "under" it: - addHoles(a.getInitialState(), positions, pos); + addHoles(builder, positions, pos); } } positions.freeBefore(pos); - } else { - // note: this isn't necessarily true. its just that we aren't surely det. - // we could optimize this further (e.g. buffer and sort synonyms at a position) - // but thats probably overkill. this is cheap and dirty - deterministic = false; } final int endPos = pos + posLengthAtt.getPositionLength(); @@ -168,31 +161,33 @@ public Automaton toAutomaton(TokenStream in) throws IOException { final BytesRef termUTF8 = changeToken(term); int[] termUnicode = null; final Position endPosData = positions.get(endPos); - if (endPosData.arriving == null) { - endPosData.arriving = new State(); + if (endPosData.arriving == -1) { + endPosData.arriving = builder.createState(); } - State state = posData.leaving; int termLen; if (unicodeArcs) { final String utf16 = termUTF8.utf8ToString(); termUnicode = new int[utf16.codePointCount(0, utf16.length())]; termLen = termUnicode.length; - for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) + for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) { termUnicode[j++] = cp = utf16.codePointAt(i); + } } else { termLen = termUTF8.length; } + int state = posData.leaving; + for(int byteIDX=0;byteIDX maxOffset) { - endState = new State(); - endState.setAccept(true); + endState = builder.createState(); + builder.setAccept(endState, true); } pos++; while (pos <= positions.getMaxPos()) { posData = positions.get(pos); - if (posData.arriving != null) { - if (endState != null) { - posData.arriving.addTransition(new Transition(POS_SEP, endState)); + if (posData.arriving != -1) { + if (endState != -1) { + builder.addTransition(posData.arriving, endState, POS_SEP); } else { - posData.arriving.setAccept(true); + builder.setAccept(posData.arriving, true); } } pos++; } - //toDot(a); - a.setDeterministic(deterministic); - return a; + return builder.finish(); } // for debugging! @@ -235,26 +228,26 @@ private static void toDot(Automaton a) throws IOException { } */ - private static void addHoles(State startState, RollingBuffer positions, int pos) { + private static void addHoles(Automaton.Builder builder, RollingBuffer positions, int pos) { Position posData = positions.get(pos); Position prevPosData = positions.get(pos-1); - while(posData.arriving == null || prevPosData.leaving == null) { - if (posData.arriving == null) { - posData.arriving = new State(); - posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving)); + while(posData.arriving == -1 || prevPosData.leaving == -1) { + if (posData.arriving == -1) { + posData.arriving = builder.createState(); + builder.addTransition(posData.arriving, posData.leaving, POS_SEP); } - if (prevPosData.leaving == null) { + if (prevPosData.leaving == -1) { if (pos == 1) { - prevPosData.leaving = startState; + prevPosData.leaving = 0; } else { - prevPosData.leaving = new State(); + prevPosData.leaving = builder.createState(); } - if (prevPosData.arriving != null) { - prevPosData.arriving.addTransition(new Transition(POS_SEP, prevPosData.leaving)); + if (prevPosData.arriving != -1) { + builder.addTransition(prevPosData.arriving, prevPosData.leaving, POS_SEP); } } - prevPosData.leaving.addTransition(new Transition(HOLE, posData.arriving)); + builder.addTransition(prevPosData.leaving, posData.arriving, HOLE); pos--; if (pos <= 0) { break; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java index 5ef41bc45963..91d2abe66d8e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java @@ -18,43 +18,25 @@ */ import java.io.IOException; -import java.io.PrintStream; import java.util.Collections; import java.util.Iterator; import java.util.TreeMap; -import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.index.CorruptIndexException; -import org.apache.lucene.index.DocsAndPositionsEnum; -import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentInfo; -import org.apache.lucene.index.TermState; import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.StringHelper; -import org.apache.lucene.util.automaton.CompiledAutomaton; -import org.apache.lucene.util.automaton.RunAutomaton; -import org.apache.lucene.util.automaton.Transition; -import org.apache.lucene.util.fst.ByteSequenceOutputs; -import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.Outputs; -import org.apache.lucene.util.fst.Util; /** A block-based terms index and dictionary that assigns * terms to variable length blocks according to how they diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java index e910ed0db678..a7a569b20b56 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java @@ -347,7 +347,7 @@ public BytesRef next() throws IOException { if (currentFrame.suffix != 0) { final int label = currentFrame.suffixBytes[currentFrame.startBytePos] & 0xff; while (label > currentFrame.curTransitionMax) { - if (currentFrame.transitionIndex >= currentFrame.transitions.length-1) { + if (currentFrame.transitionIndex >= currentFrame.transitionCount-1) { // Stop processing this frame -- no further // matches are possible because we've moved // beyond what the max transition will allow @@ -359,7 +359,8 @@ public BytesRef next() throws IOException { continue nextTerm; } currentFrame.transitionIndex++; - currentFrame.curTransitionMax = currentFrame.transitions[currentFrame.transitionIndex].getMax(); + compiledAutomaton.automaton.getNextTransition(currentFrame.transition); + currentFrame.curTransitionMax = currentFrame.transition.max; //if (DEBUG) System.out.println(" next trans=" + currentFrame.transitions[currentFrame.transitionIndex]); } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java index a39f74bd1124..e676228135c3 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java @@ -68,9 +68,10 @@ final class IntersectTermsEnumFrame { int numFollowFloorBlocks; int nextFloorLabel; - Transition[] transitions; + Transition transition = new Transition(); int curTransitionMax; int transitionIndex; + int transitionCount; FST.Arc arc; @@ -112,7 +113,7 @@ void loadNextFloorBlock() throws IOException { nextFloorLabel = 256; } // if (DEBUG) System.out.println(" nextFloorLabel=" + (char) nextFloorLabel); - } while (numFollowFloorBlocks != 0 && nextFloorLabel <= transitions[transitionIndex].getMin()); + } while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min); load(null); } @@ -120,9 +121,11 @@ void loadNextFloorBlock() throws IOException { public void setState(int state) { this.state = state; transitionIndex = 0; - transitions = ite.compiledAutomaton.sortedTransitions[state]; - if (transitions.length != 0) { - curTransitionMax = transitions[0].getMax(); + transitionCount = ite.compiledAutomaton.automaton.getNumTransitions(state); + if (transitionCount != 0) { + ite.compiledAutomaton.automaton.initTransition(state, transition); + ite.compiledAutomaton.automaton.getNextTransition(transition); + curTransitionMax = transition.max; } else { curTransitionMax = -1; } @@ -132,7 +135,7 @@ void load(BytesRef frameIndexData) throws IOException { // if (DEBUG) System.out.println(" load fp=" + fp + " fpOrig=" + fpOrig + " frameIndexData=" + frameIndexData + " trans=" + (transitions.length != 0 ? transitions[0] : "n/a" + " state=" + state)); - if (frameIndexData != null && transitions.length != 0) { + if (frameIndexData != null && transitionCount != 0) { // Floor frame if (floorData.length < frameIndexData.length) { this.floorData = new byte[ArrayUtil.oversize(frameIndexData.length, 1)]; @@ -151,7 +154,8 @@ void load(BytesRef frameIndexData) throws IOException { // first block in case it has empty suffix: if (!ite.runAutomaton.isAccept(state)) { // Maybe skip floor blocks: - while (numFollowFloorBlocks != 0 && nextFloorLabel <= transitions[0].getMin()) { + assert transitionIndex == 0: "transitionIndex=" + transitionIndex; + while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min) { fp = fpOrig + (floorDataReader.readVLong() >>> 1); numFollowFloorBlocks--; // if (DEBUG) System.out.println(" skip floor block! nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[0].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks); diff --git a/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java b/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java index e7984e95919b..dcae44fe0558 100644 --- a/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java @@ -24,6 +24,7 @@ import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.automaton.ByteRunAutomaton; import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.Transition; /** @@ -51,7 +52,7 @@ class AutomatonTermsEnum extends FilteredTermsEnum { // true if the automaton accepts a finite language private final boolean finite; // array of sorted transitions for each state, indexed by state number - private final Transition[][] allTransitions; + private final Automaton automaton; // for path tracking: each long records gen when we last // visited the state; we use gens to avoid having to clear private final long[] visited; @@ -79,7 +80,7 @@ public AutomatonTermsEnum(TermsEnum tenum, CompiledAutomaton compiled) { this.runAutomaton = compiled.runAutomaton; assert this.runAutomaton != null; this.commonSuffixRef = compiled.commonSuffixRef; - this.allTransitions = compiled.sortedTransitions; + this.automaton = compiled.automaton; // used for path tracking, where each bit is a numbered state. visited = new long[runAutomaton.getSize()]; @@ -124,6 +125,8 @@ protected BytesRef nextSeekTerm(final BytesRef term) throws IOException { } } + private Transition transition = new Transition(); + /** * Sets the enum to operate in linear fashion, as we have found * a looping transition at position: we set an upper bound and @@ -133,16 +136,20 @@ private void setLinear(int position) { assert linear == false; int state = runAutomaton.getInitialState(); + assert state == 0; int maxInterval = 0xff; + //System.out.println("setLinear pos=" + position + " seekbytesRef=" + seekBytesRef); for (int i = 0; i < position; i++) { state = runAutomaton.step(state, seekBytesRef.bytes[i] & 0xff); assert state >= 0: "state=" + state; } - for (int i = 0; i < allTransitions[state].length; i++) { - Transition t = allTransitions[state][i]; - if (t.getMin() <= (seekBytesRef.bytes[position] & 0xff) && - (seekBytesRef.bytes[position] & 0xff) <= t.getMax()) { - maxInterval = t.getMax(); + final int numTransitions = automaton.getNumTransitions(state); + automaton.initTransition(state, transition); + for (int i = 0; i < numTransitions; i++) { + automaton.getNextTransition(transition); + if (transition.min <= (seekBytesRef.bytes[position] & 0xff) && + (seekBytesRef.bytes[position] & 0xff) <= transition.max) { + maxInterval = transition.max; break; } } @@ -250,19 +257,19 @@ private boolean nextString(int state, int position) { seekBytesRef.length = position; visited[state] = curGen; - Transition transitions[] = allTransitions[state]; - + final int numTransitions = automaton.getNumTransitions(state); + automaton.initTransition(state, transition); // find the minimal path (lexicographic order) that is >= c - for (int i = 0; i < transitions.length; i++) { - Transition transition = transitions[i]; - if (transition.getMax() >= c) { - int nextChar = Math.max(c, transition.getMin()); + for (int i = 0; i < numTransitions; i++) { + automaton.getNextTransition(transition); + if (transition.max >= c) { + int nextChar = Math.max(c, transition.min); // append either the next sequential char, or the minimum transition seekBytesRef.grow(seekBytesRef.length + 1); seekBytesRef.length++; seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) nextChar; - state = transition.getDest().getNumber(); + state = transition.dest; /* * as long as is possible, continue down the minimal path in * lexicographic order. if a loop or accept state is encountered, stop. @@ -274,13 +281,14 @@ private boolean nextString(int state, int position) { * so the below is ok, if it is not an accept state, * then there MUST be at least one transition. */ - transition = allTransitions[state][0]; - state = transition.getDest().getNumber(); + automaton.initTransition(state, transition); + automaton.getNextTransition(transition); + state = transition.dest; // append the minimum transition seekBytesRef.grow(seekBytesRef.length + 1); seekBytesRef.length++; - seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) transition.getMin(); + seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) transition.min; // we found a loop, record it for faster enumeration if (!finite && !linear && visited[state] == curGen) { diff --git a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java index 6d732397d757..7aeb52058d83 100644 --- a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java @@ -24,11 +24,11 @@ import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FilteredTermsEnum; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermState; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.index.FilteredTermsEnum; import org.apache.lucene.util.Attribute; import org.apache.lucene.util.AttributeImpl; import org.apache.lucene.util.AttributeSource; @@ -36,8 +36,6 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.BasicAutomata; -import org.apache.lucene.util.automaton.BasicOperations; import org.apache.lucene.util.automaton.ByteRunAutomaton; import org.apache.lucene.util.automaton.CompiledAutomaton; import org.apache.lucene.util.automaton.LevenshteinAutomata; @@ -152,7 +150,7 @@ protected TermsEnum getAutomatonEnum(int editDistance, BytesRef lastTerm) throws IOException { final List runAutomata = initAutomata(editDistance); if (editDistance < runAutomata.size()) { - //if (BlockTreeTermsWriter.DEBUG) System.out.println("FuzzyTE.getAEnum: ed=" + editDistance + " lastTerm=" + (lastTerm==null ? "null" : lastTerm.utf8ToString())); + //System.out.println("FuzzyTE.getAEnum: ed=" + editDistance + " lastTerm=" + (lastTerm==null ? "null" : lastTerm.utf8ToString())); final CompiledAutomaton compiled = runAutomata.get(editDistance); return new AutomatonFuzzyTermsEnum(terms.intersect(compiled, lastTerm == null ? null : compiled.floor(lastTerm, new BytesRef())), runAutomata.subList(0, editDistance + 1).toArray(new CompiledAutomaton[editDistance + 1])); @@ -165,20 +163,15 @@ protected TermsEnum getAutomatonEnum(int editDistance, BytesRef lastTerm) private List initAutomata(int maxDistance) { final List runAutomata = dfaAtt.automata(); //System.out.println("cached automata size: " + runAutomata.size()); - if (runAutomata.size() <= maxDistance && + if (runAutomata.size() <= maxDistance && maxDistance <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) { LevenshteinAutomata builder = new LevenshteinAutomata(UnicodeUtil.newString(termText, realPrefixLength, termText.length - realPrefixLength), transpositions); + String prefix = UnicodeUtil.newString(termText, 0, realPrefixLength); for (int i = runAutomata.size(); i <= maxDistance; i++) { - Automaton a = builder.toAutomaton(i); + Automaton a = builder.toAutomaton(i, prefix); //System.out.println("compute automaton n=" + i); - // constant prefix - if (realPrefixLength > 0) { - Automaton prefix = BasicAutomata.makeString( - UnicodeUtil.newString(termText, 0, realPrefixLength)); - a = BasicOperations.concatenate(prefix, a); - } runAutomata.add(new CompiledAutomaton(a, true, false)); } } diff --git a/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java b/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java index e5664c09c7b6..2aba28c4dd8a 100644 --- a/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java @@ -1,7 +1,6 @@ package org.apache.lucene.search; import org.apache.lucene.index.Term; - import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.AutomatonProvider; diff --git a/lucene/core/src/java/org/apache/lucene/search/WildcardQuery.java b/lucene/core/src/java/org/apache/lucene/search/WildcardQuery.java index 12cd77001f0d..91b473e751f7 100644 --- a/lucene/core/src/java/org/apache/lucene/search/WildcardQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/WildcardQuery.java @@ -17,14 +17,14 @@ * limitations under the License. */ +import java.util.ArrayList; +import java.util.List; + import org.apache.lucene.index.Term; import org.apache.lucene.util.ToStringUtils; +import org.apache.lucene.util.automaton.Automata; +import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.BasicAutomata; -import org.apache.lucene.util.automaton.BasicOperations; - -import java.util.ArrayList; -import java.util.List; /** Implements the wildcard search query. Supported wildcards are *, which * matches any character sequence (including the empty one), and ?, @@ -72,26 +72,26 @@ public static Automaton toAutomaton(Term wildcardquery) { int length = Character.charCount(c); switch(c) { case WILDCARD_STRING: - automata.add(BasicAutomata.makeAnyString()); + automata.add(Automata.makeAnyString()); break; case WILDCARD_CHAR: - automata.add(BasicAutomata.makeAnyChar()); + automata.add(Automata.makeAnyChar()); break; case WILDCARD_ESCAPE: // add the next codepoint instead, if it exists if (i + length < wildcardText.length()) { final int nextChar = wildcardText.codePointAt(i + length); length += Character.charCount(nextChar); - automata.add(BasicAutomata.makeChar(nextChar)); + automata.add(Automata.makeChar(nextChar)); break; } // else fallthru, lenient parsing with a trailing \ default: - automata.add(BasicAutomata.makeChar(c)); + automata.add(Automata.makeChar(c)); } i += length; } - return BasicOperations.concatenate(automata); + return Operations.concatenate(automata); } /** diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java similarity index 57% rename from lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java rename to lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java index 7f51a37db95f..2d327cf4706d 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java @@ -38,18 +38,16 @@ * * @lucene.experimental */ -final public class BasicAutomata { +final public class Automata { - private BasicAutomata() {} + private Automata() {} /** * Returns a new (deterministic) automaton with the empty language. */ public static Automaton makeEmpty() { Automaton a = new Automaton(); - State s = new State(); - a.initial = s; - a.deterministic = true; + a.finishState(); return a; } @@ -58,8 +56,8 @@ public static Automaton makeEmpty() { */ public static Automaton makeEmptyString() { Automaton a = new Automaton(); - a.singleton = ""; - a.deterministic = true; + a.createState(); + a.setAccept(0, true); return a; } @@ -68,12 +66,10 @@ public static Automaton makeEmptyString() { */ public static Automaton makeAnyString() { Automaton a = new Automaton(); - State s = new State(); - a.initial = s; - s.accept = true; - s.addTransition(new Transition(Character.MIN_CODE_POINT, Character.MAX_CODE_POINT, - s)); - a.deterministic = true; + int s = a.createState(); + a.setAccept(s, true); + a.addTransition(s, s, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT); + a.finishState(); return a; } @@ -83,31 +79,43 @@ public static Automaton makeAnyString() { public static Automaton makeAnyChar() { return makeCharRange(Character.MIN_CODE_POINT, Character.MAX_CODE_POINT); } - + + /** Accept any single character starting from the specified state, returning the new state */ + public static int appendAnyChar(Automaton a, int state) { + int newState = a.createState(); + a.addTransition(state, newState, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT); + return newState; + } + /** * Returns a new (deterministic) automaton that accepts a single codepoint of * the given value. */ public static Automaton makeChar(int c) { - Automaton a = new Automaton(); - a.singleton = new String(Character.toChars(c)); - a.deterministic = true; - return a; + return makeCharRange(c, c); } - + + /** Appends the specified character to the specified state, returning a new state. */ + public static int appendChar(Automaton a, int state, int c) { + int newState = a.createState(); + a.addTransition(state, newState, c, c); + return newState; + } + /** * Returns a new (deterministic) automaton that accepts a single codepoint whose * value is in the given interval (including both end points). */ public static Automaton makeCharRange(int min, int max) { - if (min == max) return makeChar(min); + if (min > max) { + return makeEmpty(); + } Automaton a = new Automaton(); - State s1 = new State(); - State s2 = new State(); - a.initial = s1; - s2.accept = true; - if (min <= max) s1.addTransition(new Transition(min, max, s2)); - a.deterministic = true; + int s1 = a.createState(); + int s2 = a.createState(); + a.setAccept(s2, true); + a.addTransition(s1, s2, min, max); + a.finishState(); return a; } @@ -115,10 +123,13 @@ public static Automaton makeCharRange(int min, int max) { * Constructs sub-automaton corresponding to decimal numbers of length * x.substring(n).length(). */ - private static State anyOfRightLength(String x, int n) { - State s = new State(); - if (x.length() == n) s.setAccept(true); - else s.addTransition(new Transition('0', '9', anyOfRightLength(x, n + 1))); + private static int anyOfRightLength(Automaton.Builder builder, String x, int n) { + int s = builder.createState(); + if (x.length() == n) { + builder.setAccept(s, true); + } else { + builder.addTransition(s, anyOfRightLength(builder, x, n + 1), '0', '9'); + } return s; } @@ -126,17 +137,20 @@ private static State anyOfRightLength(String x, int n) { * Constructs sub-automaton corresponding to decimal numbers of value at least * x.substring(n) and length x.substring(n).length(). */ - private static State atLeast(String x, int n, Collection initials, + private static int atLeast(Automaton.Builder builder, String x, int n, Collection initials, boolean zeros) { - State s = new State(); - if (x.length() == n) s.setAccept(true); - else { - if (zeros) initials.add(s); + int s = builder.createState(); + if (x.length() == n) { + builder.setAccept(s, true); + } else { + if (zeros) { + initials.add(s); + } char c = x.charAt(n); - s.addTransition(new Transition(c, atLeast(x, n + 1, initials, zeros - && c == '0'))); - if (c < '9') s.addTransition(new Transition((char) (c + 1), '9', - anyOfRightLength(x, n + 1))); + builder.addTransition(s, atLeast(builder, x, n + 1, initials, zeros && c == '0'), c); + if (c < '9') { + builder.addTransition(s, anyOfRightLength(builder, x, n + 1), (char) (c + 1), '9'); + } } return s; } @@ -145,14 +159,16 @@ private static State atLeast(String x, int n, Collection initials, * Constructs sub-automaton corresponding to decimal numbers of value at most * x.substring(n) and length x.substring(n).length(). */ - private static State atMost(String x, int n) { - State s = new State(); - if (x.length() == n) s.setAccept(true); - else { + private static int atMost(Automaton.Builder builder, String x, int n) { + int s = builder.createState(); + if (x.length() == n) { + builder.setAccept(s, true); + } else { char c = x.charAt(n); - s.addTransition(new Transition(c, atMost(x, (char) n + 1))); - if (c > '0') s.addTransition(new Transition('0', (char) (c - 1), - anyOfRightLength(x, n + 1))); + builder.addTransition(s, atMost(builder, x, (char) n + 1), c); + if (c > '0') { + builder.addTransition(s, anyOfRightLength(builder, x, n + 1), '0', (char) (c - 1)); + } } return s; } @@ -162,27 +178,32 @@ private static State atMost(String x, int n) { * x.substring(n) and y.substring(n) and of length x.substring(n).length() * (which must be equal to y.substring(n).length()). */ - private static State between(String x, String y, int n, - Collection initials, boolean zeros) { - State s = new State(); - if (x.length() == n) s.setAccept(true); - else { - if (zeros) initials.add(s); + private static int between(Automaton.Builder builder, + String x, String y, int n, + Collection initials, boolean zeros) { + int s = builder.createState(); + if (x.length() == n) { + builder.setAccept(s, true); + } else { + if (zeros) { + initials.add(s); + } char cx = x.charAt(n); char cy = y.charAt(n); - if (cx == cy) s.addTransition(new Transition(cx, between(x, y, n + 1, - initials, zeros && cx == '0'))); - else { // cx0, use fixed number of digits (strings must be prefixed * by 0's to obtain the right length) - otherwise, the number of - * digits is not fixed + * digits is not fixed (any number of leading 0s is accepted) * @exception IllegalArgumentException if min>max or if numbers in the * interval cannot be expressed with the given fixed number of * digits */ public static Automaton makeInterval(int min, int max, int digits) throws IllegalArgumentException { - Automaton a = new Automaton(); String x = Integer.toString(min); String y = Integer.toString(max); - if (min > max || (digits > 0 && y.length() > digits)) throw new IllegalArgumentException(); + if (min > max || (digits > 0 && y.length() > digits)) { + throw new IllegalArgumentException(); + } int d; if (digits > 0) d = digits; else d = y.length(); StringBuilder bx = new StringBuilder(); - for (int i = x.length(); i < d; i++) + for (int i = x.length(); i < d; i++) { bx.append('0'); + } bx.append(x); x = bx.toString(); StringBuilder by = new StringBuilder(); - for (int i = y.length(); i < d; i++) + for (int i = y.length(); i < d; i++) { by.append('0'); + } by.append(y); y = by.toString(); - Collection initials = new ArrayList<>(); - a.initial = between(x, y, 0, initials, digits <= 0); + + Automaton.Builder builder = new Automaton.Builder(); + if (digits <= 0) { - ArrayList pairs = new ArrayList<>(); - for (State p : initials) - if (a.initial != p) pairs.add(new StatePair(a.initial, p)); - BasicOperations.addEpsilons(a, pairs); - a.initial.addTransition(new Transition('0', a.initial)); - a.deterministic = false; - } else a.deterministic = true; - a.checkMinimizeAlways(); - return a; + // Reserve the "real" initial state: + builder.createState(); + } + + Collection initials = new ArrayList<>(); + + between(builder, x, y, 0, initials, digits <= 0); + + Automaton a1 = builder.finish(); + + if (digits <= 0) { + a1.addTransition(0, 0, '0'); + for (int p : initials) { + a1.addEpsilon(0, p); + } + a1.finishState(); + } + + return a1; } /** @@ -236,22 +271,39 @@ public static Automaton makeInterval(int min, int max, int digits) */ public static Automaton makeString(String s) { Automaton a = new Automaton(); - a.singleton = s; - a.deterministic = true; + int lastState = a.createState(); + for (int i = 0, cp = 0; i < s.length(); i += Character.charCount(cp)) { + int state = a.createState(); + cp = s.codePointAt(i); + a.addTransition(lastState, state, cp, cp); + lastState = state; + } + + a.setAccept(lastState, true); + a.finishState(); + + assert a.isDeterministic(); + assert Operations.hasDeadStates(a) == false; + return a; } + /** + * Returns a new (deterministic) automaton that accepts the single given + * string from the specified unicode code points. + */ public static Automaton makeString(int[] word, int offset, int length) { Automaton a = new Automaton(); - a.setDeterministic(true); - State s = new State(); - a.initial = s; + a.createState(); + int s = 0; for (int i = offset; i < offset+length; i++) { - State s2 = new State(); - s.addTransition(new Transition(word[i], s2)); + int s2 = a.createState(); + a.addTransition(s, s2, word[i]); s = s2; } - s.accept = true; + a.setAccept(s, true); + a.finishState(); + return a; } diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java index 9cdab3916642..78e096b242fa 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java @@ -1,777 +1,773 @@ +package org.apache.lucene.util.automaton; + /* - * dk.brics.automaton - * - * Copyright (c) 2001-2009 Anders Moeller - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ -package org.apache.lucene.util.automaton; - +//import java.io.IOException; +//import java.io.PrintWriter; import java.util.Arrays; import java.util.BitSet; -import java.util.Collection; -import java.util.HashMap; import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; import java.util.Set; import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.RamUsageEstimator; - -/** - * Finite-state automaton with regular expression operations. - *

- * Class invariants: - *

    - *
  • An automaton is either represented explicitly (with {@link State} and - * {@link Transition} objects) or with a singleton string (see - * {@link #getSingleton()} and {@link #expandSingleton()}) in case the automaton - * is known to accept exactly one string. (Implicitly, all states and - * transitions of an automaton are reachable from its initial state.) - *
  • Automata are always reduced (see {@link #reduce()}) and have no - * transitions to dead states (see {@link #removeDeadTransitions()}). - *
  • If an automaton is nondeterministic, then {@link #isDeterministic()} - * returns false (but the converse is not required). - *
  • Automata provided as input to operations are generally assumed to be - * disjoint. - *
- *

- * If the states or transitions are manipulated manually, the - * {@link #restoreInvariant()} and {@link #setDeterministic(boolean)} methods - * should be used afterwards to restore representation invariants that are - * assumed by the built-in automata operations. - * - *

- *

- * Note: This class has internal mutable state and is not thread safe. It is - * the caller's responsibility to ensure any necessary synchronization if you - * wish to use the same Automaton from multiple threads. In general it is instead - * recommended to use a {@link RunAutomaton} for multithreaded matching: it is immutable, - * thread safe, and much faster. - *

- * @lucene.experimental - */ -public class Automaton implements Cloneable { - - /** - * Minimize using Hopcroft's O(n log n) algorithm. This is regarded as one of - * the most generally efficient algorithms that exist. - * - * @see #setMinimization(int) - */ - public static final int MINIMIZE_HOPCROFT = 2; - - /** Selects minimization algorithm (default: MINIMIZE_HOPCROFT). */ - static int minimization = MINIMIZE_HOPCROFT; - - /** Initial state of this automaton. */ - State initial; - - /** - * If true, then this automaton is definitely deterministic (i.e., there are - * no choices for any run, but a run may crash). - */ - boolean deterministic; - - /** Extra data associated with this automaton. */ - transient Object info; - - /** - * Hash code. Recomputed by {@link MinimizationOperations#minimize(Automaton)} - */ - //int hash_code; - - /** Singleton string. Null if not applicable. */ - String singleton; - - /** Minimize always flag. */ - static boolean minimize_always = false; - - /** - * Selects whether operations may modify the input automata (default: - * false). - */ - static boolean allow_mutation = false; - - /** - * Constructs a new automaton that accepts the empty language. Using this - * constructor, automata can be constructed manually from {@link State} and - * {@link Transition} objects. - * - * @see State - * @see Transition - */ - public Automaton(State initial) { - this.initial = initial; - deterministic = true; - singleton = null; - } +import org.apache.lucene.util.InPlaceMergeSorter; +import org.apache.lucene.util.Sorter; + + +// TODO +// - could use packed int arrays instead +// - could encode dest w/ delta from to? + +/** Represents an automaton and all its states and transitions. States + * are integers and must be created using {@link #createState}. Mark a + * state as an accept state using {@link #setAccept}. Add transitions + * using {@link #addTransition}. Each state must have all of its + * transitions added at once; if this is too restrictive then use + * {@link Automaton.Builder} instead. State 0 is always the + * initial state. Once a state is finished, either + * because you've starting adding transitions to another state or you + * call {@link #finishState}, then that states transitions are sorted + * (first by min, then max, then dest) and reduced (transitions with + * adjacent labels going to the same dest are combined). + * + * @lucene.experimental */ +public class Automaton { + /** Where we next write to the int[] states; this increments by 2 for + * each added state because we pack a pointer to the transitions + * array and a count of how many transitions leave the state. */ + private int nextState; + + /** Where we next write to in int[] transitions; this + * increments by 3 for each added transition because we + * pack min, max, dest in sequence. */ + private int nextTransition; + + /** Current state we are adding transitions to; the caller + * must add all transitions for this state before moving + * onto another state. */ + private int curState = -1; + + /** Index in the transitions array, where this states + * leaving transitions are stored, or -1 if this state + * has not added any transitions yet, followed by number + * of transitions. */ + private int[] states = new int[4]; + + /** Holds toState, min, max for each transition. */ + private int[] transitions = new int[6]; + + private final BitSet isAccept = new BitSet(4); + + /** True if no state has two transitions leaving with the same label. */ + private boolean deterministic = true; + + /** Sole constructor; creates an automaton with no states. */ public Automaton() { - this(new State()); - } - - /** - * Selects minimization algorithm (default: MINIMIZE_HOPCROFT). - * - * @param algorithm minimization algorithm - */ - static public void setMinimization(int algorithm) { - minimization = algorithm; - } - - /** - * Sets or resets minimize always flag. If this flag is set, then - * {@link MinimizationOperations#minimize(Automaton)} will automatically be - * invoked after all operations that otherwise may produce non-minimal - * automata. By default, the flag is not set. - * - * @param flag if true, the flag is set - */ - static public void setMinimizeAlways(boolean flag) { - minimize_always = flag; } - - /** - * Sets or resets allow mutate flag. If this flag is set, then all automata - * operations may modify automata given as input; otherwise, operations will - * always leave input automata languages unmodified. By default, the flag is - * not set. - * - * @param flag if true, the flag is set - * @return previous value of the flag - */ - static public boolean setAllowMutate(boolean flag) { - boolean b = allow_mutation; - allow_mutation = flag; - return b; - } - - /** - * Returns the state of the allow mutate flag. If this flag is set, then all - * automata operations may modify automata given as input; otherwise, - * operations will always leave input automata languages unmodified. By - * default, the flag is not set. - * - * @return current value of the flag - */ - static boolean getAllowMutate() { - return allow_mutation; - } - - void checkMinimizeAlways() { - if (minimize_always) MinimizationOperations.minimize(this); + + /** Create a new state. */ + public int createState() { + growStates(); + int state = nextState/2; + states[nextState] = -1; + nextState += 2; + return state; } - - boolean isSingleton() { - return singleton != null; + + /** Set or clear this state as an accept state. */ + public void setAccept(int state, boolean accept) { + if (state >= getNumStates()) { + throw new IllegalArgumentException("state=" + state + " is out of bounds (numStates=" + getNumStates() + ")"); + } + if (accept) { + isAccept.set(state); + } else { + isAccept.clear(state); + } } - - /** - * Returns the singleton string for this automaton. An automaton that accepts - * exactly one string may be represented in singleton mode. In that - * case, this method may be used to obtain the string. - * - * @return string, null if this automaton is not in singleton mode. - */ - public String getSingleton() { - return singleton; + + /** Sugar to get all transitions for all states. This is + * object-heavy; it's better to iterate state by state instead. */ + public Transition[][] getSortedTransitions() { + int numStates = getNumStates(); + Transition[][] transitions = new Transition[numStates][]; + for(int s=0;s= nextState/2) { + throw new IllegalArgumentException("source=" + source + " is out of bounds (maxState is " + (nextState/2-1) + ")"); + } + if (dest >= nextState/2) { + throw new IllegalArgumentException("dest=" + dest + " is out of bounds (max state is " + (nextState/2-1) + ")"); + } + + growTransitions(); + if (curState != source) { + if (curState != -1) { + finishCurrentState(); + } + + // Move to next source: + curState = source; + if (states[2*curState] != -1) { + throw new IllegalStateException("from state (" + source + ") already had transitions added"); + } + assert states[2*curState+1] == 0; + states[2*curState] = nextTransition; + } + + transitions[nextTransition++] = dest; + transitions[nextTransition++] = min; + transitions[nextTransition++] = max; + + // Increment transition count for this state + states[2*curState+1]++; } - - /** - * Associates extra information with this automaton. - * - * @param info extra information - */ - public void setInfo(Object info) { - this.info = info; + + /** Add a [virtual] epsilon transition between source and dest. + * Dest state must already have all transitions added because this + * method simply copies those same transitions over to source. */ + public void addEpsilon(int source, int dest) { + Transition t = new Transition(); + int count = initTransition(dest, t); + for(int i=0;i 0; - public State[] getNumberedStates() { - if (numberedStates == null) { - expandSingleton(); - final Set visited = new HashSet<>(); - final LinkedList worklist = new LinkedList<>(); - State states[] = new State[4]; - int upto = 0; - worklist.add(initial); - visited.add(initial); - initial.number = upto; - states[upto] = initial; - upto++; - while (worklist.size() > 0) { - State s = worklist.removeFirst(); - for (int i=0;i max) { + max = tMax; + } + } else { + if (dest != -1) { + transitions[offset+3*upto] = dest; + transitions[offset+3*upto+1] = min; + transitions[offset+3*upto+2] = max; upto++; } + min = tMin; + max = tMax; + } + } else { + if (dest != -1) { + transitions[offset+3*upto] = dest; + transitions[offset+3*upto+1] = min; + transitions[offset+3*upto+2] = max; + upto++; } + dest = tDest; + min = tMin; + max = tMax; } - if (states.length != upto) { - final State[] newArray = new State[upto]; - System.arraycopy(states, 0, newArray, 0, upto); - states = newArray; + } + + if (dest != -1) { + // Last transition + transitions[offset+3*upto] = dest; + transitions[offset+3*upto+1] = min; + transitions[offset+3*upto+2] = max; + upto++; + } + + nextTransition -= (numTransitions-upto)*3; + states[2*curState+1] = upto; + + // Sort transitions by min/max/dest: + minMaxDestSorter.sort(start, start+upto); + + if (deterministic && upto > 1) { + int lastMax = transitions[offset+2]; + for(int i=1;i= states.length) { + states = ArrayUtil.grow(states, nextState+2); + } } - /** - * Returns the set of reachable accept states. - * - * @return set of {@link State} objects - */ - public Set getAcceptStates() { - expandSingleton(); - HashSet accepts = new HashSet<>(); - HashSet visited = new HashSet<>(); - LinkedList worklist = new LinkedList<>(); - worklist.add(initial); - visited.add(initial); - while (worklist.size() > 0) { - State s = worklist.removeFirst(); - if (s.accept) accepts.add(s); - for (Transition t : s.getTransitions()) - if (!visited.contains(t.to)) { - visited.add(t.to); - worklist.add(t.to); - } + private void growTransitions() { + if (nextTransition+3 >= transitions.length) { + transitions = ArrayUtil.grow(transitions, nextTransition+3); } - return accepts; } - - /** - * Adds transitions to explicit crash state to ensure that transition function - * is total. - */ - void totalize() { - State s = new State(); - s.addTransition(new Transition(Character.MIN_CODE_POINT, Character.MAX_CODE_POINT, - s)); - for (State p : getNumberedStates()) { - int maxi = Character.MIN_CODE_POINT; - p.sortTransitions(Transition.CompareByMinMaxThenDest); - for (Transition t : p.getTransitions()) { - if (t.min > maxi) p.addTransition(new Transition(maxi, - (t.min - 1), s)); - if (t.max + 1 > maxi) maxi = t.max + 1; + + /** Sorts transitions by dest, ascending, then min label ascending, then max label ascending */ + private final Sorter destMinMaxSorter = new InPlaceMergeSorter() { + + private void swapOne(int i, int j) { + int x = transitions[i]; + transitions[i] = transitions[j]; + transitions[j] = x; + } + + @Override + protected void swap(int i, int j) { + int iStart = 3*i; + int jStart = 3*j; + swapOne(iStart, jStart); + swapOne(iStart+1, jStart+1); + swapOne(iStart+2, jStart+2); + }; + + @Override + protected int compare(int i, int j) { + int iStart = 3*i; + int jStart = 3*j; + + // First dest: + int iDest = transitions[iStart]; + int jDest = transitions[jStart]; + if (iDest < jDest) { + return -1; + } else if (iDest > jDest) { + return 1; + } + + // Then min: + int iMin = transitions[iStart+1]; + int jMin = transitions[jStart+1]; + if (iMin < jMin) { + return -1; + } else if (iMin > jMin) { + return 1; + } + + // Then max: + int iMax = transitions[iStart+2]; + int jMax = transitions[jStart+2]; + if (iMax < jMax) { + return -1; + } else if (iMax > jMax) { + return 1; + } + + return 0; + } + }; + + /** Sorts transitions by min label, ascending, then max label ascending, then dest ascending */ + private final Sorter minMaxDestSorter = new InPlaceMergeSorter() { + + private void swapOne(int i, int j) { + int x = transitions[i]; + transitions[i] = transitions[j]; + transitions[j] = x; + } + + @Override + protected void swap(int i, int j) { + int iStart = 3*i; + int jStart = 3*j; + swapOne(iStart, jStart); + swapOne(iStart+1, jStart+1); + swapOne(iStart+2, jStart+2); + }; + + @Override + protected int compare(int i, int j) { + int iStart = 3*i; + int jStart = 3*j; + + // First min: + int iMin = transitions[iStart+1]; + int jMin = transitions[jStart+1]; + if (iMin < jMin) { + return -1; + } else if (iMin > jMin) { + return 1; + } + + // Then max: + int iMax = transitions[iStart+2]; + int jMax = transitions[jStart+2]; + if (iMax < jMax) { + return -1; + } else if (iMax > jMax) { + return 1; + } + + // Then dest: + int iDest = transitions[iStart]; + int jDest = transitions[jStart]; + if (iDest < jDest) { + return -1; + } else if (iDest > jDest) { + return 1; + } + + return 0; } - if (maxi <= Character.MAX_CODE_POINT) p.addTransition(new Transition( - maxi, Character.MAX_CODE_POINT, s)); + }; + + /** Initialize the provided Transition to iterate through all transitions + * leaving the specified state. You must call {@link #getNextTransition} to + * get each transition. Returns the number of transitions + * leaving this state. */ + public int initTransition(int state, Transition t) { + assert state < nextState/2: "state=" + state + " nextState=" + nextState; + t.source = state; + t.transitionUpto = states[2*state]; + return getNumTransitions(state); + } + + /** Iterate to the next transition after the provided one */ + public void getNextTransition(Transition t) { + // Make sure there is still a transition left: + assert (t.transitionUpto+3 - states[2*t.source]) <= 3*states[2*t.source+1]; + t.dest = transitions[t.transitionUpto++]; + t.min = transitions[t.transitionUpto++]; + t.max = transitions[t.transitionUpto++]; + } + + /** Fill the provided {@link Transition} with the index'th + * transition leaving the specified state. */ + public void getTransition(int state, int index, Transition t) { + int i = states[2*state] + 3*index; + t.source = state; + t.dest = transitions[i++]; + t.min = transitions[i++]; + t.max = transitions[i++]; + } + + static void appendCharString(int c, StringBuilder b) { + if (c >= 0x21 && c <= 0x7e && c != '\\' && c != '"') b.appendCodePoint(c); + else { + b.append("\\\\U"); + String s = Integer.toHexString(c); + if (c < 0x10) b.append("0000000").append(s); + else if (c < 0x100) b.append("000000").append(s); + else if (c < 0x1000) b.append("00000").append(s); + else if (c < 0x10000) b.append("0000").append(s); + else if (c < 0x100000) b.append("000").append(s); + else if (c < 0x1000000) b.append("00").append(s); + else if (c < 0x10000000) b.append("0").append(s); + else b.append(s); } - clearNumberedStates(); } - - /** - * Restores representation invariant. This method must be invoked before any - * built-in automata operation is performed if automaton states or transitions - * are manipulated manually. - * - * @see #setDeterministic(boolean) - */ - public void restoreInvariant() { - removeDeadTransitions(); + + /* + public void writeDot(String fileName) { + if (fileName.indexOf('/') == -1) { + fileName = "/l/la/lucene/core/" + fileName + ".dot"; + } + try { + PrintWriter pw = new PrintWriter(fileName); + pw.println(toDot()); + pw.close(); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } } - - /** - * Reduces this automaton. An automaton is "reduced" by combining overlapping - * and adjacent edge intervals with same destination. - */ - public void reduce() { - final State[] states = getNumberedStates(); - if (isSingleton()) return; - for (State s : states) - s.reduce(); + */ + + /** Returns the dot (graphviz) representation of this automaton. + * This is extremely useful for visualizing the automaton. */ + public String toDot() { + // TODO: breadth first search so we can see get layered output... + + StringBuilder b = new StringBuilder(); + b.append("digraph Automaton {\n"); + b.append(" rankdir = LR\n"); + final int numStates = getNumStates(); + if (numStates > 0) { + b.append(" initial [shape=plaintext,label=\"0\"]\n"); + b.append(" initial -> 0\n"); + } + + Transition t = new Transition(); + + for(int state=0;state "); + b.append(t.dest); + b.append(" [label=\""); + appendCharString(t.min, b); + if (t.max != t.min) { + b.append('-'); + appendCharString(t.max, b); + } + b.append("\"]\n"); + //System.out.println(" t=" + t); + } + } + b.append('}'); + return b.toString(); } - + /** * Returns sorted array of all interval start points. */ int[] getStartPoints() { - final State[] states = getNumberedStates(); Set pointset = new HashSet<>(); pointset.add(Character.MIN_CODE_POINT); - for (State s : states) { - for (Transition t : s.getTransitions()) { - pointset.add(t.min); - if (t.max < Character.MAX_CODE_POINT) pointset.add((t.max + 1)); + //System.out.println("getStartPoints"); + for (int s=0;s live = new HashSet<>(); - for (State q : states) { - if (q.isAccept()) { - live.add(q); + * @param state starting state + * @param label codepoint to look up + * @return destination state, -1 if no matching outgoing transition + */ + public int step(int state, int label) { + assert state >= 0; + assert label >= 0; + int trans = states[2*state]; + int limit = trans + 3*states[2*state+1]; + // TODO: we could do bin search; transitions are sorted + while (trans < limit) { + int dest = transitions[trans]; + int min = transitions[trans+1]; + int max = transitions[trans+2]; + if (min <= label && label <= max) { + return dest; } + trans += 3; + } + + return -1; + } + + /** Records new states and transitions and then {@link + * #finish} creates the {@link Automaton}. Use this + * when you cannot create the Automaton directly because + * it's too restrictive to have to add all transitions + * leaving each state at once. */ + public static class Builder { + private int[] transitions = new int[4]; + private int nextTransition; + private final Automaton a = new Automaton(); + + /** Sole constructor. */ + public Builder() { + } + + /** Add a new transition with min = max = label. */ + public void addTransition(int source, int dest, int label) { + addTransition(source, dest, label, label); } - // map> - @SuppressWarnings({"rawtypes","unchecked"}) Set map[] = new Set[states.length]; - for (int i = 0; i < map.length; i++) - map[i] = new HashSet<>(); - for (State s : states) { - for(int i=0;i worklist = new LinkedList<>(live); - while (worklist.size() > 0) { - State s = worklist.removeFirst(); - for (State p : map[s.number]) - if (!live.contains(p)) { - live.add(p); - worklist.add(p); + + /** Sorts transitions first then min label ascending, then + * max label ascending, then dest ascending */ + private final Sorter sorter = new InPlaceMergeSorter() { + + private void swapOne(int i, int j) { + int x = transitions[i]; + transitions[i] = transitions[j]; + transitions[j] = x; } - } - return live.toArray(new State[live.size()]); - } + @Override + protected void swap(int i, int j) { + int iStart = 4*i; + int jStart = 4*j; + swapOne(iStart, jStart); + swapOne(iStart+1, jStart+1); + swapOne(iStart+2, jStart+2); + swapOne(iStart+3, jStart+3); + }; - /** - * Removes transitions to dead states and calls {@link #reduce()}. - * (A state is "dead" if no accept state is - * reachable from it.) - */ - public void removeDeadTransitions() { - final State[] states = getNumberedStates(); - //clearHashCode(); - if (isSingleton()) return; - State[] live = getLiveStates(); - - BitSet liveSet = new BitSet(states.length); - for (State s : live) - liveSet.set(s.number); - - for (State s : states) { - // filter out transitions to dead states: - int upto = 0; - for(int i=0;i jSrc) { + return 1; + } + + // Then min: + int iMin = transitions[iStart+2]; + int jMin = transitions[jStart+2]; + if (iMin < jMin) { + return -1; + } else if (iMin > jMin) { + return 1; + } + + // Then max: + int iMax = transitions[iStart+3]; + int jMax = transitions[jStart+3]; + if (iMax < jMax) { + return -1; + } else if (iMax > jMax) { + return 1; + } + + // First dest: + int iDest = transitions[iStart+1]; + int jDest = transitions[jStart+1]; + if (iDest < jDest) { + return -1; + } else if (iDest > jDest) { + return 1; + } + + return 0; } + }; + + /** Compiles all added states and transitions into a new {@code Automaton} + * and returns it. */ + public Automaton finish() { + //System.out.println("LA.Builder.finish: count=" + (nextTransition/4)); + // TODO: we could make this more efficient, + // e.g. somehow xfer the int[] to the automaton, or + // alloc exactly the right size from the automaton + //System.out.println("finish pending"); + sorter.sort(0, nextTransition/4); + int upto = 0; + while (upto < nextTransition) { + a.addTransition(transitions[upto], + transitions[upto+1], + transitions[upto+2], + transitions[upto+3]); + upto += 4; } - s.numTransitions = upto; - } - for(int i=0;i 0) { - setNumberedStates(live); - } else { - // sneaky corner case -- if machine accepts no strings - clearNumberedStates(); + + /** Create a new state. */ + public int createState() { + return a.createState(); } - reduce(); - } - - /** - * Returns a sorted array of transitions for each state (and sets state - * numbers). - */ - public Transition[][] getSortedTransitions() { - final State[] states = getNumberedStates(); - Transition[][] transitions = new Transition[states.length][]; - for (State s : states) { - s.sortTransitions(Transition.CompareByMinMaxThenDest); - s.trimTransitionsArray(); - transitions[s.number] = s.transitionsArray; - assert s.transitionsArray != null; + + /** Set or clear this state as an accept state. */ + public void setAccept(int state, boolean accept) { + a.setAccept(state, accept); } - return transitions; - } - - /** - * Expands singleton representation to normal representation. Does nothing if - * not in singleton representation. - */ - public void expandSingleton() { - if (isSingleton()) { - State p = new State(); - initial = p; - for (int i = 0, cp = 0; i < singleton.length(); i += Character.charCount(cp)) { - State q = new State(); - p.addTransition(new Transition(cp = singleton.codePointAt(i), q)); - p = q; - } - p.accept = true; - deterministic = true; - singleton = null; + + /** Returns true if this state is an accept state. */ + public boolean isAccept(int state) { + return a.isAccept(state); } - } - - /** - * Returns the number of states in this automaton. - */ - public int getNumberOfStates() { - if (isSingleton()) return singleton.codePointCount(0, singleton.length()) + 1; - return getNumberedStates().length; - } - - /** - * Returns the number of transitions in this automaton. This number is counted - * as the total number of edges, where one edge may be a character interval. - */ - public int getNumberOfTransitions() { - if (isSingleton()) return singleton.codePointCount(0, singleton.length()); - int c = 0; - for (State s : getNumberedStates()) - c += s.numTransitions(); - return c; - } - - @Override - public boolean equals(Object obj) { - throw new UnsupportedOperationException("use BasicOperations.sameLanguage instead"); - } - @Override - public int hashCode() { - throw new UnsupportedOperationException(); - } - - /** - * Must be invoked when the stored hash code may no longer be valid. - */ - /* - void clearHashCode() { - hash_code = 0; - } - */ - - /** - * Returns a string representation of this automaton. - */ - @Override - public String toString() { - StringBuilder b = new StringBuilder(); - if (isSingleton()) { - b.append("singleton: "); - int length = singleton.codePointCount(0, singleton.length()); - int codepoints[] = new int[length]; - for (int i = 0, j = 0, cp = 0; i < singleton.length(); i += Character.charCount(cp)) - codepoints[j++] = cp = singleton.codePointAt(i); - for (int c : codepoints) - Transition.appendCharString(c, b); - b.append("\n"); - } else { - State[] states = getNumberedStates(); - b.append("initial state: ").append(initial.number).append("\n"); - for (State s : states) - b.append(s.toString()); + /** How many states this automaton has. */ + public int getNumStates() { + return a.getNumStates(); } - return b.toString(); - } - - /** - * Returns Graphviz Dot representation of this automaton. - */ - public String toDot() { - StringBuilder b = new StringBuilder("digraph Automaton {\n"); - b.append(" rankdir = LR;\n"); - State[] states = getNumberedStates(); - for (State s : states) { - b.append(" ").append(s.number); - if (s.accept) b.append(" [shape=doublecircle,label=\"" + s.number + "\"];\n"); - else b.append(" [shape=circle,label=\" " + s.number + "\"];\n"); - if (s == initial) { - b.append(" initial [shape=plaintext,label=\"\"];\n"); - b.append(" initial -> ").append(s.number).append("\n"); - } - for (Transition t : s.getTransitions()) { - b.append(" ").append(s.number); - t.appendDot(b); + + /** Copies over all states/transitions from other. */ + public void copy(Automaton other) { + int offset = getNumStates(); + int otherNumStates = other.getNumStates(); + for(int s=0;sallow_mutation is - * set, expands if singleton. - */ - Automaton cloneExpandedIfRequired() { - if (allow_mutation) { - expandSingleton(); - return this; - } else return cloneExpanded(); - } - - /** - * Returns a clone of this automaton. - */ - @Override - public Automaton clone() { - try { - Automaton a = (Automaton) super.clone(); - if (!isSingleton()) { - HashMap m = new HashMap<>(); - State[] states = getNumberedStates(); - for (State s : states) - m.put(s, new State()); - for (State s : states) { - State p = m.get(s); - p.accept = s.accept; - if (s == initial) a.initial = p; - for (Transition t : s.getTransitions()) - p.addTransition(new Transition(t.min, t.max, m.get(t.to))); + Transition t = new Transition(); + for(int s=0;sallow_mutation flag is set. - */ - Automaton cloneIfRequired() { - if (allow_mutation) return this; - else return clone(); - } - - /** - * See {@link BasicOperations#concatenate(Automaton, Automaton)}. - */ - public Automaton concatenate(Automaton a) { - return BasicOperations.concatenate(this, a); - } - - /** - * See {@link BasicOperations#concatenate(List)}. - */ - static public Automaton concatenate(List l) { - return BasicOperations.concatenate(l); - } - - /** - * See {@link BasicOperations#optional(Automaton)}. - */ - public Automaton optional() { - return BasicOperations.optional(this); - } - - /** - * See {@link BasicOperations#repeat(Automaton)}. - */ - public Automaton repeat() { - return BasicOperations.repeat(this); - } - - /** - * See {@link BasicOperations#repeat(Automaton, int)}. - */ - public Automaton repeat(int min) { - return BasicOperations.repeat(this, min); - } - - /** - * See {@link BasicOperations#repeat(Automaton, int, int)}. - */ - public Automaton repeat(int min, int max) { - return BasicOperations.repeat(this, min, max); - } - - /** - * See {@link BasicOperations#complement(Automaton)}. - */ - public Automaton complement() { - return BasicOperations.complement(this); - } - - /** - * See {@link BasicOperations#minus(Automaton, Automaton)}. - */ - public Automaton minus(Automaton a) { - return BasicOperations.minus(this, a); - } - - /** - * See {@link BasicOperations#intersection(Automaton, Automaton)}. - */ - public Automaton intersection(Automaton a) { - return BasicOperations.intersection(this, a); - } - - /** - * See {@link BasicOperations#subsetOf(Automaton, Automaton)}. - */ - public boolean subsetOf(Automaton a) { - return BasicOperations.subsetOf(this, a); - } - - /** - * See {@link BasicOperations#union(Automaton, Automaton)}. - */ - public Automaton union(Automaton a) { - return BasicOperations.union(this, a); - } - - /** - * See {@link BasicOperations#union(Collection)}. - */ - static public Automaton union(Collection l) { - return BasicOperations.union(l); - } - - /** - * See {@link BasicOperations#determinize(Automaton)}. - */ - public void determinize() { - BasicOperations.determinize(this); - } - - /** - * See {@link BasicOperations#isEmptyString(Automaton)}. - */ - public boolean isEmptyString() { - return BasicOperations.isEmptyString(this); - } - - /** - * See {@link MinimizationOperations#minimize(Automaton)}. Returns the - * automaton being given as argument. - */ - public static Automaton minimize(Automaton a) { - MinimizationOperations.minimize(a); - return a; - } } diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/BasicOperations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/BasicOperations.java deleted file mode 100644 index 1d953585a7b0..000000000000 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/BasicOperations.java +++ /dev/null @@ -1,853 +0,0 @@ -/* - * dk.brics.automaton - * - * Copyright (c) 2001-2009 Anders Moeller - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -package org.apache.lucene.util.automaton; - -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.RamUsageEstimator; - -import java.util.ArrayList; -import java.util.BitSet; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Set; - -/** - * Basic automata operations. - * - * @lucene.experimental - */ -final public class BasicOperations { - - private BasicOperations() {} - - /** - * Returns an automaton that accepts the concatenation of the languages of the - * given automata. - *

- * Complexity: linear in number of states. - */ - static public Automaton concatenate(Automaton a1, Automaton a2) { - if (a1.isSingleton() && a2.isSingleton()) return BasicAutomata - .makeString(a1.singleton + a2.singleton); - if (isEmpty(a1) || isEmpty(a2)) - return BasicAutomata.makeEmpty(); - // adding epsilon transitions with the NFA concatenation algorithm - // in this case always produces a resulting DFA, preventing expensive - // redundant determinize() calls for this common case. - boolean deterministic = a1.isSingleton() && a2.isDeterministic(); - if (a1 == a2) { - a1 = a1.cloneExpanded(); - a2 = a2.cloneExpanded(); - } else { - a1 = a1.cloneExpandedIfRequired(); - a2 = a2.cloneExpandedIfRequired(); - } - for (State s : a1.getAcceptStates()) { - s.accept = false; - s.addEpsilon(a2.initial); - } - a1.deterministic = deterministic; - //a1.clearHashCode(); - a1.clearNumberedStates(); - a1.checkMinimizeAlways(); - return a1; - } - - /** - * Returns an automaton that accepts the concatenation of the languages of the - * given automata. - *

- * Complexity: linear in total number of states. - */ - static public Automaton concatenate(List l) { - if (l.isEmpty()) return BasicAutomata.makeEmptyString(); - boolean all_singleton = true; - for (Automaton a : l) - if (!a.isSingleton()) { - all_singleton = false; - break; - } - if (all_singleton) { - StringBuilder b = new StringBuilder(); - for (Automaton a : l) - b.append(a.singleton); - return BasicAutomata.makeString(b.toString()); - } else { - for (Automaton a : l) - if (BasicOperations.isEmpty(a)) return BasicAutomata.makeEmpty(); - Set ids = new HashSet<>(); - for (Automaton a : l) - ids.add(System.identityHashCode(a)); - boolean has_aliases = ids.size() != l.size(); - Automaton b = l.get(0); - if (has_aliases) b = b.cloneExpanded(); - else b = b.cloneExpandedIfRequired(); - Set ac = b.getAcceptStates(); - boolean first = true; - for (Automaton a : l) - if (first) first = false; - else { - if (a.isEmptyString()) continue; - Automaton aa = a; - if (has_aliases) aa = aa.cloneExpanded(); - else aa = aa.cloneExpandedIfRequired(); - Set ns = aa.getAcceptStates(); - for (State s : ac) { - s.accept = false; - s.addEpsilon(aa.initial); - if (s.accept) ns.add(s); - } - ac = ns; - } - b.deterministic = false; - //b.clearHashCode(); - b.clearNumberedStates(); - b.checkMinimizeAlways(); - return b; - } - } - - /** - * Returns an automaton that accepts the union of the empty string and the - * language of the given automaton. - *

- * Complexity: linear in number of states. - */ - static public Automaton optional(Automaton a) { - a = a.cloneExpandedIfRequired(); - State s = new State(); - s.addEpsilon(a.initial); - s.accept = true; - a.initial = s; - a.deterministic = false; - //a.clearHashCode(); - a.clearNumberedStates(); - a.checkMinimizeAlways(); - return a; - } - - /** - * Returns an automaton that accepts the Kleene star (zero or more - * concatenated repetitions) of the language of the given automaton. Never - * modifies the input automaton language. - *

- * Complexity: linear in number of states. - */ - static public Automaton repeat(Automaton a) { - a = a.cloneExpanded(); - State s = new State(); - s.accept = true; - s.addEpsilon(a.initial); - for (State p : a.getAcceptStates()) - p.addEpsilon(s); - a.initial = s; - a.deterministic = false; - //a.clearHashCode(); - a.clearNumberedStates(); - a.checkMinimizeAlways(); - return a; - } - - /** - * Returns an automaton that accepts min or more concatenated - * repetitions of the language of the given automaton. - *

- * Complexity: linear in number of states and in min. - */ - static public Automaton repeat(Automaton a, int min) { - if (min == 0) return repeat(a); - List as = new ArrayList<>(); - while (min-- > 0) - as.add(a); - as.add(repeat(a)); - return concatenate(as); - } - - /** - * Returns an automaton that accepts between min and - * max (including both) concatenated repetitions of the language - * of the given automaton. - *

- * Complexity: linear in number of states and in min and - * max. - */ - static public Automaton repeat(Automaton a, int min, int max) { - if (min > max) return BasicAutomata.makeEmpty(); - max -= min; - a.expandSingleton(); - Automaton b; - if (min == 0) b = BasicAutomata.makeEmptyString(); - else if (min == 1) b = a.clone(); - else { - List as = new ArrayList<>(); - while (min-- > 0) - as.add(a); - b = concatenate(as); - } - if (max > 0) { - Automaton d = a.clone(); - while (--max > 0) { - Automaton c = a.clone(); - for (State p : c.getAcceptStates()) - p.addEpsilon(d.initial); - d = c; - } - for (State p : b.getAcceptStates()) - p.addEpsilon(d.initial); - b.deterministic = false; - //b.clearHashCode(); - b.clearNumberedStates(); - b.checkMinimizeAlways(); - } - return b; - } - - /** - * Returns a (deterministic) automaton that accepts the complement of the - * language of the given automaton. - *

- * Complexity: linear in number of states (if already deterministic). - */ - static public Automaton complement(Automaton a) { - a = a.cloneExpandedIfRequired(); - a.determinize(); - a.totalize(); - for (State p : a.getNumberedStates()) - p.accept = !p.accept; - a.removeDeadTransitions(); - return a; - } - - /** - * Returns a (deterministic) automaton that accepts the intersection of the - * language of a1 and the complement of the language of - * a2. As a side-effect, the automata may be determinized, if not - * already deterministic. - *

- * Complexity: quadratic in number of states (if already deterministic). - */ - static public Automaton minus(Automaton a1, Automaton a2) { - if (BasicOperations.isEmpty(a1) || a1 == a2) return BasicAutomata - .makeEmpty(); - if (BasicOperations.isEmpty(a2)) return a1.cloneIfRequired(); - if (a1.isSingleton()) { - if (BasicOperations.run(a2, a1.singleton)) return BasicAutomata.makeEmpty(); - else return a1.cloneIfRequired(); - } - return intersection(a1, a2.complement()); - } - - /** - * Returns an automaton that accepts the intersection of the languages of the - * given automata. Never modifies the input automata languages. - *

- * Complexity: quadratic in number of states. - */ - static public Automaton intersection(Automaton a1, Automaton a2) { - if (a1.isSingleton()) { - if (BasicOperations.run(a2, a1.singleton)) return a1.cloneIfRequired(); - else return BasicAutomata.makeEmpty(); - } - if (a2.isSingleton()) { - if (BasicOperations.run(a1, a2.singleton)) return a2.cloneIfRequired(); - else return BasicAutomata.makeEmpty(); - } - if (a1 == a2) return a1.cloneIfRequired(); - Transition[][] transitions1 = a1.getSortedTransitions(); - Transition[][] transitions2 = a2.getSortedTransitions(); - Automaton c = new Automaton(); - LinkedList worklist = new LinkedList<>(); - HashMap newstates = new HashMap<>(); - StatePair p = new StatePair(c.initial, a1.initial, a2.initial); - worklist.add(p); - newstates.put(p, p); - while (worklist.size() > 0) { - p = worklist.removeFirst(); - p.s.accept = p.s1.accept && p.s2.accept; - Transition[] t1 = transitions1[p.s1.number]; - Transition[] t2 = transitions2[p.s2.number]; - for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) { - while (b2 < t2.length && t2[b2].max < t1[n1].min) - b2++; - for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) - if (t2[n2].max >= t1[n1].min) { - StatePair q = new StatePair(t1[n1].to, t2[n2].to); - StatePair r = newstates.get(q); - if (r == null) { - q.s = new State(); - worklist.add(q); - newstates.put(q, q); - r = q; - } - int min = t1[n1].min > t2[n2].min ? t1[n1].min : t2[n2].min; - int max = t1[n1].max < t2[n2].max ? t1[n1].max : t2[n2].max; - p.s.addTransition(new Transition(min, max, r.s)); - } - } - } - c.deterministic = a1.deterministic && a2.deterministic; - c.removeDeadTransitions(); - c.checkMinimizeAlways(); - return c; - } - - /** Returns true if these two automata accept exactly the - * same language. This is a costly computation! Note - * also that a1 and a2 will be determinized as a side - * effect. */ - public static boolean sameLanguage(Automaton a1, Automaton a2) { - if (a1 == a2) { - return true; - } - if (a1.isSingleton() && a2.isSingleton()) { - return a1.singleton.equals(a2.singleton); - } else if (a1.isSingleton()) { - // subsetOf is faster if the first automaton is a singleton - return subsetOf(a1, a2) && subsetOf(a2, a1); - } else { - return subsetOf(a2, a1) && subsetOf(a1, a2); - } - } - - /** - * Returns true if the language of a1 is a subset of the language - * of a2. As a side-effect, a2 is determinized if - * not already marked as deterministic. - *

- * Complexity: quadratic in number of states. - */ - public static boolean subsetOf(Automaton a1, Automaton a2) { - if (a1 == a2) return true; - if (a1.isSingleton()) { - if (a2.isSingleton()) return a1.singleton.equals(a2.singleton); - return BasicOperations.run(a2, a1.singleton); - } - a2.determinize(); - Transition[][] transitions1 = a1.getSortedTransitions(); - Transition[][] transitions2 = a2.getSortedTransitions(); - LinkedList worklist = new LinkedList<>(); - HashSet visited = new HashSet<>(); - StatePair p = new StatePair(a1.initial, a2.initial); - worklist.add(p); - visited.add(p); - while (worklist.size() > 0) { - p = worklist.removeFirst(); - if (p.s1.accept && !p.s2.accept) { - return false; - } - Transition[] t1 = transitions1[p.s1.number]; - Transition[] t2 = transitions2[p.s2.number]; - for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) { - while (b2 < t2.length && t2[b2].max < t1[n1].min) - b2++; - int min1 = t1[n1].min, max1 = t1[n1].max; - - for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) { - if (t2[n2].min > min1) { - return false; - } - if (t2[n2].max < Character.MAX_CODE_POINT) min1 = t2[n2].max + 1; - else { - min1 = Character.MAX_CODE_POINT; - max1 = Character.MIN_CODE_POINT; - } - StatePair q = new StatePair(t1[n1].to, t2[n2].to); - if (!visited.contains(q)) { - worklist.add(q); - visited.add(q); - } - } - if (min1 <= max1) { - return false; - } - } - } - return true; - } - - /** - * Returns an automaton that accepts the union of the languages of the given - * automata. - *

- * Complexity: linear in number of states. - */ - public static Automaton union(Automaton a1, Automaton a2) { - if ((a1.isSingleton() && a2.isSingleton() && a1.singleton - .equals(a2.singleton)) - || a1 == a2) return a1.cloneIfRequired(); - if (a1 == a2) { - a1 = a1.cloneExpanded(); - a2 = a2.cloneExpanded(); - } else { - a1 = a1.cloneExpandedIfRequired(); - a2 = a2.cloneExpandedIfRequired(); - } - State s = new State(); - s.addEpsilon(a1.initial); - s.addEpsilon(a2.initial); - a1.initial = s; - a1.deterministic = false; - //a1.clearHashCode(); - a1.clearNumberedStates(); - a1.checkMinimizeAlways(); - return a1; - } - - /** - * Returns an automaton that accepts the union of the languages of the given - * automata. - *

- * Complexity: linear in number of states. - */ - public static Automaton union(Collection l) { - Set ids = new HashSet<>(); - for (Automaton a : l) - ids.add(System.identityHashCode(a)); - boolean has_aliases = ids.size() != l.size(); - State s = new State(); - for (Automaton b : l) { - if (BasicOperations.isEmpty(b)) continue; - Automaton bb = b; - if (has_aliases) bb = bb.cloneExpanded(); - else bb = bb.cloneExpandedIfRequired(); - s.addEpsilon(bb.initial); - } - Automaton a = new Automaton(); - a.initial = s; - a.deterministic = false; - //a.clearHashCode(); - a.clearNumberedStates(); - a.checkMinimizeAlways(); - return a; - } - - // Simple custom ArrayList - private final static class TransitionList { - Transition[] transitions = new Transition[2]; - int count; - - public void add(Transition t) { - if (transitions.length == count) { - Transition[] newArray = new Transition[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; - System.arraycopy(transitions, 0, newArray, 0, count); - transitions = newArray; - } - transitions[count++] = t; - } - } - - // Holds all transitions that start on this int point, or - // end at this point-1 - private final static class PointTransitions implements Comparable { - int point; - final TransitionList ends = new TransitionList(); - final TransitionList starts = new TransitionList(); - @Override - public int compareTo(PointTransitions other) { - return point - other.point; - } - - public void reset(int point) { - this.point = point; - ends.count = 0; - starts.count = 0; - } - - @Override - public boolean equals(Object other) { - return ((PointTransitions) other).point == point; - } - - @Override - public int hashCode() { - return point; - } - } - - private final static class PointTransitionSet { - int count; - PointTransitions[] points = new PointTransitions[5]; - - private final static int HASHMAP_CUTOVER = 30; - private final HashMap map = new HashMap<>(); - private boolean useHash = false; - - private PointTransitions next(int point) { - // 1st time we are seeing this point - if (count == points.length) { - final PointTransitions[] newArray = new PointTransitions[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; - System.arraycopy(points, 0, newArray, 0, count); - points = newArray; - } - PointTransitions points0 = points[count]; - if (points0 == null) { - points0 = points[count] = new PointTransitions(); - } - points0.reset(point); - count++; - return points0; - } - - private PointTransitions find(int point) { - if (useHash) { - final Integer pi = point; - PointTransitions p = map.get(pi); - if (p == null) { - p = next(point); - map.put(pi, p); - } - return p; - } else { - for(int i=0;i 1) ArrayUtil.timSort(points, 0, count); - } - - public void add(Transition t) { - find(t.min).starts.add(t); - find(1+t.max).ends.add(t); - } - - @Override - public String toString() { - StringBuilder s = new StringBuilder(); - for(int i=0;i 0) { - s.append(' '); - } - s.append(points[i].point).append(':').append(points[i].starts.count).append(',').append(points[i].ends.count); - } - return s.toString(); - } - } - - /** - * Determinizes the given automaton. - *

- * Worst case complexity: exponential in number of states. - */ - public static void determinize(Automaton a) { - if (a.deterministic || a.isSingleton()) { - return; - } - - final State[] allStates = a.getNumberedStates(); - - // subset construction - final boolean initAccept = a.initial.accept; - final int initNumber = a.initial.number; - a.initial = new State(); - SortedIntSet.FrozenIntSet initialset = new SortedIntSet.FrozenIntSet(initNumber, a.initial); - - LinkedList worklist = new LinkedList<>(); - Map newstate = new HashMap<>(); - - worklist.add(initialset); - - a.initial.accept = initAccept; - newstate.put(initialset, a.initial); - - int newStateUpto = 0; - State[] newStatesArray = new State[5]; - newStatesArray[newStateUpto] = a.initial; - a.initial.number = newStateUpto; - newStateUpto++; - - // like Set - final PointTransitionSet points = new PointTransitionSet(); - - // like SortedMap - final SortedIntSet statesSet = new SortedIntSet(5); - - while (worklist.size() > 0) { - SortedIntSet.FrozenIntSet s = worklist.removeFirst(); - - // Collate all outgoing transitions by min/1+max: - for(int i=0;i 0) { - assert lastPoint != -1; - - statesSet.computeHash(); - - State q = newstate.get(statesSet); - if (q == null) { - q = new State(); - final SortedIntSet.FrozenIntSet p = statesSet.freeze(q); - worklist.add(p); - if (newStateUpto == newStatesArray.length) { - final State[] newArray = new State[ArrayUtil.oversize(1+newStateUpto, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; - System.arraycopy(newStatesArray, 0, newArray, 0, newStateUpto); - newStatesArray = newArray; - } - newStatesArray[newStateUpto] = q; - q.number = newStateUpto; - newStateUpto++; - q.accept = accCount > 0; - newstate.put(p, q); - } else { - assert (accCount > 0 ? true:false) == q.accept: "accCount=" + accCount + " vs existing accept=" + q.accept + " states=" + statesSet; - } - - r.addTransition(new Transition(lastPoint, point-1, q)); - } - - // process transitions that end on this point - // (closes an overlapping interval) - Transition[] transitions = points.points[i].ends.transitions; - int limit = points.points[i].ends.count; - for(int j=0;j 0 && BasicOperations.sameLanguage(automaton, BasicAutomata.makeString(commonPrefix))) { - singleton = commonPrefix; - } else { - singleton = null; - } + + if (commonPrefix.length() > 0 && Operations.sameLanguage(automaton, Automata.makeString(commonPrefix))) { + singleton = commonPrefix; } else { - commonPrefix = null; - singleton = automaton.getSingleton(); + singleton = null; } - + if (singleton != null) { - // matches a fixed string in singleton or expanded - // representation + // matches a fixed string type = AUTOMATON_TYPE.SINGLE; term = new BytesRef(singleton); commonSuffixRef = null; runAutomaton = null; - sortedTransitions = null; - this.finite = null; - return; - } else if (BasicOperations.sameLanguage(automaton, BasicOperations.concatenate( - BasicAutomata.makeString(commonPrefix), BasicAutomata.makeAnyString()))) { - // matches a constant prefix - type = AUTOMATON_TYPE.PREFIX; - term = new BytesRef(commonPrefix); - commonSuffixRef = null; - runAutomaton = null; - sortedTransitions = null; + this.automaton = null; this.finite = null; return; + } else if (commonPrefix.length() > 0) { + Automaton other = Operations.concatenate(Automata.makeString(commonPrefix), Automata.makeAnyString()); + other = Operations.determinize(other); + assert Operations.hasDeadStates(other) == false; + if (Operations.sameLanguage(automaton, other)) { + // matches a constant prefix + type = AUTOMATON_TYPE.PREFIX; + term = new BytesRef(commonPrefix); + commonSuffixRef = null; + runAutomaton = null; + this.automaton = null; + this.finite = null; + return; + } } } } type = AUTOMATON_TYPE.NORMAL; term = null; + if (finite == null) { - this.finite = SpecialOperations.isFinite(automaton); + this.finite = Operations.isFinite(automaton); } else { this.finite = finite; } + Automaton utf8 = new UTF32ToUTF8().convert(automaton); if (this.finite) { commonSuffixRef = null; } else { - commonSuffixRef = SpecialOperations.getCommonSuffixBytesRef(utf8); + commonSuffixRef = Operations.getCommonSuffixBytesRef(utf8); } runAutomaton = new ByteRunAutomaton(utf8, true); - sortedTransitions = utf8.getSortedTransitions(); + + this.automaton = runAutomaton.automaton; } + + private Transition transition = new Transition(); //private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG; private BytesRef addTail(int state, BytesRef term, int idx, int leadLabel) { - + //System.out.println("addTail state=" + state + " term=" + term.utf8ToString() + " idx=" + idx + " leadLabel=" + (char) leadLabel); + //System.out.println(automaton.toDot()); // Find biggest transition that's < label // TODO: use binary search here - Transition maxTransition = null; - for (Transition transition : sortedTransitions[state]) { + int maxIndex = -1; + int numTransitions = automaton.initTransition(state, transition); + for(int i=0;i ").append(i).append("\n"); - } - for (int j = 0; j < sortedTransitions[i].length; j++) { - b.append(" ").append(i); - sortedTransitions[i][j].appendDot(b); - } - } - return b.append("}\n").toString(); - } @Override public int hashCode() { diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java index 68ce1e9a0517..f96b837b8935 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java @@ -29,7 +29,7 @@ * (nearly linear with the input size). * * @see #build(Collection) - * @see BasicAutomata#makeStringUnion(Collection) + * @see Automata#makeStringUnion(Collection) */ final class DaciukMihovAutomatonBuilder { /** @@ -249,20 +249,22 @@ public State complete() { /** * Internal recursive traversal for conversion. */ - private static org.apache.lucene.util.automaton.State convert(State s, - IdentityHashMap visited) { - org.apache.lucene.util.automaton.State converted = visited.get(s); - if (converted != null) return converted; + private static int convert(Automaton.Builder a, State s, + IdentityHashMap visited) { + + Integer converted = visited.get(s); + if (converted != null) { + return converted; + } - converted = new org.apache.lucene.util.automaton.State(); - converted.setAccept(s.is_final); + converted = a.createState(); + a.setAccept(converted, s.is_final); visited.put(s, converted); int i = 0; int[] labels = s.labels; for (DaciukMihovAutomatonBuilder.State target : s.states) { - converted.addTransition( - new Transition(labels[i++], convert(target, visited))); + a.addTransition(converted, convert(a, target, visited), labels[i++]); } return converted; @@ -281,12 +283,12 @@ public static Automaton build(Collection input) { builder.add(scratch); } - Automaton a = new Automaton(); - a.initial = convert( + Automaton.Builder a = new Automaton.Builder(); + convert(a, builder.complete(), - new IdentityHashMap()); - a.deterministic = true; - return a; + new IdentityHashMap()); + + return a.finish(); } /** diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java index 869dc3ad5515..01badf0fa99e 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java @@ -21,6 +21,8 @@ import java.util.SortedSet; import java.util.TreeSet; +import org.apache.lucene.util.UnicodeUtil; + /** * Class to construct DFAs that match a word within some edit distance. *

@@ -30,7 +32,8 @@ * @lucene.experimental */ public class LevenshteinAutomata { - /** @lucene.internal */ + /** Maximum edit distance this class can generate an automaton for. + * @lucene.internal */ public static final int MAXIMUM_SUPPORTED_DISTANCE = 2; /* input word */ final int word[]; @@ -112,7 +115,7 @@ private static int[] codePoints(String input) { } return word; } - + /** * Compute a DFA that accepts all strings within an edit distance of n. *

@@ -125,8 +128,25 @@ private static int[] codePoints(String input) { *

*/ public Automaton toAutomaton(int n) { + return toAutomaton(n, ""); + } + + /** + * Compute a DFA that accepts all strings within an edit distance of n, + * matching the specified exact prefix. + *

+ * All automata have the following properties: + *

    + *
  • They are deterministic (DFA). + *
  • There are no transitions to dead states. + *
  • They are not minimal (some transitions could be combined). + *
+ *

+ */ + public Automaton toAutomaton(int n, String prefix) { + assert prefix != null; if (n == 0) { - return BasicAutomata.makeString(word, 0, word.length); + return Automata.makeString(prefix + UnicodeUtil.newString(word, 0, word.length)); } if (n >= descriptions.length) @@ -135,15 +155,36 @@ public Automaton toAutomaton(int n) { final int range = 2*n+1; ParametricDescription description = descriptions[n]; // the number of states is based on the length of the word and n - State states[] = new State[description.size()]; + int numStates = description.size(); + + Automaton a = new Automaton(); + int lastState; + if (prefix != null) { + // Insert prefix + lastState = a.createState(); + for (int i = 0, cp = 0; i < prefix.length(); i += Character.charCount(cp)) { + int state = a.createState(); + cp = prefix.codePointAt(i); + a.addTransition(lastState, state, cp, cp); + lastState = state; + } + } else { + lastState = a.createState(); + } + + int stateOffset = lastState; + a.setAccept(lastState, description.isAccept(0)); + // create all states, and mark as accept states if appropriate - for (int i = 0; i < states.length; i++) { - states[i] = new State(); - states[i].number = i; - states[i].setAccept(description.isAccept(i)); + for (int i = 1; i < numStates; i++) { + int state = a.createState(); + a.setAccept(state, description.isAccept(i)); } + + // TODO: this creates bogus states/transitions (states are final, have self loops, and can't be reached from an init state) + // create transitions from state to state - for (int k = 0; k < states.length; k++) { + for (int k = 0; k < numStates; k++) { final int xpos = description.getPosition(k); if (xpos < 0) continue; @@ -154,31 +195,26 @@ public Automaton toAutomaton(int n) { // get the characteristic vector at this position wrt ch final int cvec = getVector(ch, xpos, end); int dest = description.transition(k, xpos, cvec); - if (dest >= 0) - states[k].addTransition(new Transition(ch, states[dest])); + if (dest >= 0) { + a.addTransition(stateOffset+k, stateOffset+dest, ch); + } } // add transitions for all other chars in unicode // by definition, their characteristic vectors are always 0, // because they do not exist in the input string. int dest = description.transition(k, xpos, 0); // by definition - if (dest >= 0) - for (int r = 0; r < numRanges; r++) - states[k].addTransition(new Transition(rangeLower[r], rangeUpper[r], states[dest])); + if (dest >= 0) { + for (int r = 0; r < numRanges; r++) { + a.addTransition(stateOffset+k, stateOffset+dest, rangeLower[r], rangeUpper[r]); + } + } } - Automaton a = new Automaton(states[0]); - a.setDeterministic(true); - // we create some useless unconnected states, and its a net-win overall to remove these, - // as well as to combine any adjacent transitions (it makes later algorithms more efficient). - // so, while we could set our numberedStates here, its actually best not to, and instead to - // force a traversal in reduce, pruning the unconnected states while we combine adjacent transitions. - //a.setNumberedStates(states); - a.reduce(); - // we need not trim transitions to dead states, as they are not created. - //a.restoreInvariant(); + a.finishState(); + assert a.isDeterministic(); return a; } - + /** * Get the characteristic vector X(x, V) * where V is substring(pos, end) diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java index 85f8d58762df..223b25b730fd 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java @@ -46,40 +46,43 @@ private MinimizationOperations() {} /** * Minimizes (and determinizes if not already deterministic) the given * automaton. - * - * @see Automaton#setMinimization(int) */ - public static void minimize(Automaton a) { - if (!a.isSingleton()) { - minimizeHopcroft(a); - } - // recompute hash code - //a.hash_code = 1a.getNumberOfStates() * 3 + a.getNumberOfTransitions() * 2; - //if (a.hash_code == 0) a.hash_code = 1; + public static Automaton minimize(Automaton a) { + return minimizeHopcroft(a); } /** * Minimizes the given automaton using Hopcroft's algorithm. */ - public static void minimizeHopcroft(Automaton a) { - a.determinize(); - if (a.initial.numTransitions == 1) { - Transition t = a.initial.transitionsArray[0]; - if (t.to == a.initial && t.min == Character.MIN_CODE_POINT - && t.max == Character.MAX_CODE_POINT) return; + public static Automaton minimizeHopcroft(Automaton a) { + if (a.getNumStates() == 0 || (a.isAccept(0) == false && a.getNumTransitions(0) == 0)) { + // Fastmatch for common case + return new Automaton(); + } + a = Operations.determinize(a); + //a.writeDot("adet"); + if (a.getNumTransitions(0) == 1) { + Transition t = new Transition(); + a.getTransition(0, 0, t); + if (t.dest == 0 && t.min == Character.MIN_CODE_POINT + && t.max == Character.MAX_CODE_POINT) { + // Accepts all strings + return a; + } } - a.totalize(); + a = Operations.totalize(a); + //a.writeDot("atot"); // initialize data structures final int[] sigma = a.getStartPoints(); - final State[] states = a.getNumberedStates(); - final int sigmaLen = sigma.length, statesLen = states.length; - @SuppressWarnings({"rawtypes","unchecked"}) final ArrayList[][] reverse = - (ArrayList[][]) new ArrayList[statesLen][sigmaLen]; - @SuppressWarnings({"rawtypes","unchecked"}) final HashSet[] partition = - (HashSet[]) new HashSet[statesLen]; - @SuppressWarnings({"rawtypes","unchecked"}) final ArrayList[] splitblock = - (ArrayList[]) new ArrayList[statesLen]; + final int sigmaLen = sigma.length, statesLen = a.getNumStates(); + + @SuppressWarnings({"rawtypes","unchecked"}) final ArrayList[][] reverse = + (ArrayList[][]) new ArrayList[statesLen][sigmaLen]; + @SuppressWarnings({"rawtypes","unchecked"}) final HashSet[] partition = + (HashSet[]) new HashSet[statesLen]; + @SuppressWarnings({"rawtypes","unchecked"}) final ArrayList[] splitblock = + (ArrayList[]) new ArrayList[statesLen]; final int[] block = new int[statesLen]; final StateList[][] active = new StateList[statesLen][sigmaLen]; final StateListNode[][] active2 = new StateListNode[statesLen][sigmaLen]; @@ -96,71 +99,78 @@ public static void minimizeHopcroft(Automaton a) { } // find initial partition and reverse edges for (int q = 0; q < statesLen; q++) { - final State qq = states[q]; - final int j = qq.accept ? 0 : 1; - partition[j].add(qq); + final int j = a.isAccept(q) ? 0 : 1; + partition[j].add(q); block[q] = j; for (int x = 0; x < sigmaLen; x++) { - final ArrayList[] r = - reverse[qq.step(sigma[x]).number]; - if (r[x] == null) + final ArrayList[] r = reverse[a.step(q, sigma[x])]; + if (r[x] == null) { r[x] = new ArrayList<>(); - r[x].add(qq); + } + r[x].add(q); } } // initialize active sets for (int j = 0; j <= 1; j++) { for (int x = 0; x < sigmaLen; x++) { - for (final State qq : partition[j]) { - if (reverse[qq.number][x] != null) - active2[qq.number][x] = active[j][x].add(qq); + for (int q : partition[j]) { + if (reverse[q][x] != null) { + active2[q][x] = active[j][x].add(q); + } } } } + // initialize pending for (int x = 0; x < sigmaLen; x++) { final int j = (active[0][x].size <= active[1][x].size) ? 0 : 1; pending.add(new IntPair(j, x)); pending2.set(x*statesLen + j); } + // process pending until fixed point int k = 2; + //System.out.println("start min"); while (!pending.isEmpty()) { + //System.out.println(" cycle pending"); final IntPair ip = pending.removeFirst(); final int p = ip.n1; final int x = ip.n2; + //System.out.println(" pop n1=" + ip.n1 + " n2=" + ip.n2); pending2.clear(x*statesLen + p); // find states that need to be split off their blocks for (StateListNode m = active[p][x].first; m != null; m = m.next) { - final ArrayList r = reverse[m.q.number][x]; - if (r != null) for (final State s : r) { - final int i = s.number; - if (!split.get(i)) { - split.set(i); - final int j = block[i]; - splitblock[j].add(s); - if (!refine2.get(j)) { - refine2.set(j); - refine.set(j); + final ArrayList r = reverse[m.q][x]; + if (r != null) { + for (int i : r) { + if (!split.get(i)) { + split.set(i); + final int j = block[i]; + splitblock[j].add(i); + if (!refine2.get(j)) { + refine2.set(j); + refine.set(j); + } } } } } + // refine blocks for (int j = refine.nextSetBit(0); j >= 0; j = refine.nextSetBit(j+1)) { - final ArrayList sb = splitblock[j]; + final ArrayList sb = splitblock[j]; if (sb.size() < partition[j].size()) { - final HashSet b1 = partition[j]; - final HashSet b2 = partition[k]; - for (final State s : sb) { + final HashSet b1 = partition[j]; + final HashSet b2 = partition[k]; + for (int s : sb) { b1.remove(s); b2.add(s); - block[s.number] = k; + block[s] = k; for (int c = 0; c < sigmaLen; c++) { - final StateListNode sn = active2[s.number][c]; + final StateListNode sn = active2[s][c]; if (sn != null && sn.sl == active[j][c]) { sn.remove(); - active2[s.number][c] = active[k][c].add(s); + active2[s][c] = active[k][c].add(s); } } } @@ -180,33 +190,69 @@ public static void minimizeHopcroft(Automaton a) { k++; } refine2.clear(j); - for (final State s : sb) - split.clear(s.number); + for (int s : sb) { + split.clear(s); + } sb.clear(); } refine.clear(); } + + Automaton result = new Automaton(); + + Transition t = new Transition(); + + //System.out.println(" k=" + k); + // make a new state for each equivalence class, set initial state - State[] newstates = new State[k]; - for (int n = 0; n < newstates.length; n++) { - final State s = new State(); - newstates[n] = s; - for (State q : partition[n]) { - if (q == a.initial) a.initial = s; - s.accept = q.accept; - s.number = q.number; // select representative - q.number = n; + int[] stateMap = new int[statesLen]; + int[] stateRep = new int[k]; + + result.createState(); + + //System.out.println("min: k=" + k); + for (int n = 0; n < k; n++) { + //System.out.println(" n=" + n); + + boolean isInitial = false; + for (int q : partition[n]) { + if (q == 0) { + isInitial = true; + //System.out.println(" isInitial!"); + break; + } + } + + int newState; + if (isInitial) { + newState = 0; + } else { + newState = result.createState(); + } + + //System.out.println(" newState=" + newState); + + for (int q : partition[n]) { + stateMap[q] = newState; + //System.out.println(" q=" + q + " isAccept?=" + a.isAccept(q)); + result.setAccept(newState, a.isAccept(q)); + stateRep[newState] = q; // select representative } } + // build transitions and set acceptance - for (int n = 0; n < newstates.length; n++) { - final State s = newstates[n]; - s.accept = states[s.number].accept; - for (Transition t : states[s.number].getTransitions()) - s.addTransition(new Transition(t.min, t.max, newstates[t.to.number])); + for (int n = 0; n < k; n++) { + int numTransitions = a.initTransition(stateRep[n], t); + for(int i=0;i + * Complexity: linear in total number of states. + */ + static public Automaton concatenate(Automaton a1, Automaton a2) { + return concatenate(Arrays.asList(a1, a2)); + } + + /** + * Returns an automaton that accepts the concatenation of the languages of the + * given automata. + *

+ * Complexity: linear in total number of states. + */ + static public Automaton concatenate(List l) { + Automaton result = new Automaton(); + + // First pass: create all states + for(Automaton a : l) { + if (a.getNumStates() == 0) { + result.finishState(); + return result; + } + int numStates = a.getNumStates(); + for(int s=0;s + * Complexity: linear in number of states. + */ + static public Automaton optional(Automaton a) { + Automaton result = new Automaton(); + result.createState(); + result.setAccept(0, true); + if (a.getNumStates() > 0) { + result.copy(a); + result.addEpsilon(0, 1); + } + result.finishState(); + return result; + } + + /** + * Returns an automaton that accepts the Kleene star (zero or more + * concatenated repetitions) of the language of the given automaton. Never + * modifies the input automaton language. + *

+ * Complexity: linear in number of states. + */ + static public Automaton repeat(Automaton a) { + Automaton.Builder builder = new Automaton.Builder(); + builder.createState(); + builder.setAccept(0, true); + builder.copy(a); + + Transition t = new Transition(); + int count = a.initTransition(0, t); + for(int i=0;imin or more concatenated + * repetitions of the language of the given automaton. + *

+ * Complexity: linear in number of states and in min. + */ + static public Automaton repeat(Automaton a, int min) { + if (min == 0) { + return repeat(a); + } + List as = new ArrayList<>(); + while (min-- > 0) { + as.add(a); + } + as.add(repeat(a)); + return concatenate(as); + } + + /** + * Returns an automaton that accepts between min and + * max (including both) concatenated repetitions of the language + * of the given automaton. + *

+ * Complexity: linear in number of states and in min and + * max. + */ + static public Automaton repeat(Automaton a, int min, int max) { + if (min > max) { + return Automata.makeEmpty(); + } + + Automaton b; + if (min == 0) { + b = Automata.makeEmptyString(); + } else if (min == 1) { + b = new Automaton(); + b.copy(a); + } else { + List as = new ArrayList<>(); + for(int i=0;i prevAcceptStates = toSet(b, 0); + + for(int i=min;i toSet(Automaton a, int offset) { + int numStates = a.getNumStates(); + BitSet isAccept = a.getAcceptStates(); + Set result = new HashSet(); + int upto = 0; + while (upto < numStates && (upto = isAccept.nextSetBit(upto)) != -1) { + result.add(offset+upto); + upto++; + } + + return result; + } + + /** + * Returns a (deterministic) automaton that accepts the complement of the + * language of the given automaton. + *

+ * Complexity: linear in number of states (if already deterministic). + */ + static public Automaton complement(Automaton a) { + a = totalize(determinize(a)); + int numStates = a.getNumStates(); + for (int p=0;pa1 and the complement of the language of + * a2. As a side-effect, the automata may be determinized, if not + * already deterministic. + *

+ * Complexity: quadratic in number of states (if already deterministic). + */ + static public Automaton minus(Automaton a1, Automaton a2) { + if (Operations.isEmpty(a1) || a1 == a2) { + return Automata.makeEmpty(); + } + if (Operations.isEmpty(a2)) { + return a1; + } + return intersection(a1, complement(a2)); + } + + /** + * Returns an automaton that accepts the intersection of the languages of the + * given automata. Never modifies the input automata languages. + *

+ * Complexity: quadratic in number of states. + */ + static public Automaton intersection(Automaton a1, Automaton a2) { + if (a1 == a2) { + return a1; + } + if (a1.getNumStates() == 0) { + return a1; + } + if (a2.getNumStates() == 0) { + return a2; + } + Transition[][] transitions1 = a1.getSortedTransitions(); + Transition[][] transitions2 = a2.getSortedTransitions(); + Automaton c = new Automaton(); + c.createState(); + LinkedList worklist = new LinkedList<>(); + HashMap newstates = new HashMap<>(); + StatePair p = new StatePair(0, 0, 0); + worklist.add(p); + newstates.put(p, p); + while (worklist.size() > 0) { + p = worklist.removeFirst(); + c.setAccept(p.s, a1.isAccept(p.s1) && a2.isAccept(p.s2)); + Transition[] t1 = transitions1[p.s1]; + Transition[] t2 = transitions2[p.s2]; + for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) { + while (b2 < t2.length && t2[b2].max < t1[n1].min) + b2++; + for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) + if (t2[n2].max >= t1[n1].min) { + StatePair q = new StatePair(t1[n1].dest, t2[n2].dest); + StatePair r = newstates.get(q); + if (r == null) { + q.s = c.createState(); + worklist.add(q); + newstates.put(q, q); + r = q; + } + int min = t1[n1].min > t2[n2].min ? t1[n1].min : t2[n2].min; + int max = t1[n1].max < t2[n2].max ? t1[n1].max : t2[n2].max; + c.addTransition(p.s, r.s, min, max); + } + } + } + c.finishState(); + + return removeDeadStates(c); + } + + /** Returns true if these two automata accept exactly the + * same language. This is a costly computation! Note + * also that a1 and a2 will be determinized as a side + * effect. Both automata must be determinized and have + * no dead states! */ + public static boolean sameLanguage(Automaton a1, Automaton a2) { + if (a1 == a2) { + return true; + } + return subsetOf(a2, a1) && subsetOf(a1, a2); + } + + // TODO: move to test-framework? + /** Returns true if this automaton has any states that cannot + * be reached from the initial state or cannot reach an accept state. + * Cost is O(numTransitions+numStates). */ + public static boolean hasDeadStates(Automaton a) { + BitSet liveStates = getLiveStates(a); + int numLive = liveStates.cardinality(); + int numStates = a.getNumStates(); + assert numLive <= numStates: "numLive=" + numLive + " numStates=" + numStates + " " + liveStates; + return numLive < numStates; + } + + // TODO: move to test-framework? + /** Returns true if there are dead states reachable from an initial state. */ + public static boolean hasDeadStatesFromInitial(Automaton a) { + BitSet reachableFromInitial = getLiveStatesFromInitial(a); + BitSet reachableFromAccept = getLiveStatesToAccept(a); + reachableFromInitial.andNot(reachableFromAccept); + return reachableFromInitial.isEmpty() == false; + } + + // TODO: move to test-framework? + /** Returns true if there are dead states that reach an accept state. */ + public static boolean hasDeadStatesToAccept(Automaton a) { + BitSet reachableFromInitial = getLiveStatesFromInitial(a); + BitSet reachableFromAccept = getLiveStatesToAccept(a); + reachableFromAccept.andNot(reachableFromInitial); + return reachableFromAccept.isEmpty() == false; + } + + /** + * Returns true if the language of a1 is a subset of the language + * of a2. Both automata must be determinized and must have no dead + * states. + *

+ * Complexity: quadratic in number of states. + */ + public static boolean subsetOf(Automaton a1, Automaton a2) { + if (a1.isDeterministic() == false) { + throw new IllegalArgumentException("a1 must be deterministic"); + } + if (a2.isDeterministic() == false) { + throw new IllegalArgumentException("a2 must be deterministic"); + } + assert hasDeadStatesFromInitial(a1) == false; + assert hasDeadStatesFromInitial(a2) == false; + if (a1.getNumStates() == 0) { + // Empty language is alwyas a subset of any other language + return true; + } else if (a2.getNumStates() == 0) { + return isEmpty(a1); + } + + // TODO: cutover to iterators instead + Transition[][] transitions1 = a1.getSortedTransitions(); + Transition[][] transitions2 = a2.getSortedTransitions(); + LinkedList worklist = new LinkedList<>(); + HashSet visited = new HashSet<>(); + StatePair p = new StatePair(0, 0); + worklist.add(p); + visited.add(p); + while (worklist.size() > 0) { + p = worklist.removeFirst(); + if (a1.isAccept(p.s1) && a2.isAccept(p.s2) == false) { + return false; + } + Transition[] t1 = transitions1[p.s1]; + Transition[] t2 = transitions2[p.s2]; + for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) { + while (b2 < t2.length && t2[b2].max < t1[n1].min) { + b2++; + } + int min1 = t1[n1].min, max1 = t1[n1].max; + + for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) { + if (t2[n2].min > min1) { + return false; + } + if (t2[n2].max < Character.MAX_CODE_POINT) { + min1 = t2[n2].max + 1; + } else { + min1 = Character.MAX_CODE_POINT; + max1 = Character.MIN_CODE_POINT; + } + StatePair q = new StatePair(t1[n1].dest, t2[n2].dest); + if (!visited.contains(q)) { + worklist.add(q); + visited.add(q); + } + } + if (min1 <= max1) { + return false; + } + } + } + return true; + } + + /** + * Returns an automaton that accepts the union of the languages of the given + * automata. + *

+ * Complexity: linear in number of states. + */ + public static Automaton union(Automaton a1, Automaton a2) { + return union(Arrays.asList(a1, a2)); + } + + /** + * Returns an automaton that accepts the union of the languages of the given + * automata. + *

+ * Complexity: linear in number of states. + */ + public static Automaton union(Collection l) { + Automaton result = new Automaton(); + + // Create initial state: + result.createState(); + + // Copy over all automata + Transition t = new Transition(); + for(Automaton a : l) { + result.copy(a); + } + + // Add epsilon transition from new initial state + int stateOffset = 1; + for(Automaton a : l) { + if (a.getNumStates() == 0) { + continue; + } + result.addEpsilon(0, stateOffset); + stateOffset += a.getNumStates(); + } + + result.finishState(); + + return result; + } + + // Simple custom ArrayList + private final static class TransitionList { + // dest, min, max + int[] transitions = new int[3]; + int next; + + public void add(Transition t) { + if (transitions.length < next+3) { + transitions = ArrayUtil.grow(transitions, next+3); + } + transitions[next] = t.dest; + transitions[next+1] = t.min; + transitions[next+2] = t.max; + next += 3; + } + } + + // Holds all transitions that start on this int point, or + // end at this point-1 + private final static class PointTransitions implements Comparable { + int point; + final TransitionList ends = new TransitionList(); + final TransitionList starts = new TransitionList(); + + @Override + public int compareTo(PointTransitions other) { + return point - other.point; + } + + public void reset(int point) { + this.point = point; + ends.next = 0; + starts.next = 0; + } + + @Override + public boolean equals(Object other) { + return ((PointTransitions) other).point == point; + } + + @Override + public int hashCode() { + return point; + } + } + + private final static class PointTransitionSet { + int count; + PointTransitions[] points = new PointTransitions[5]; + + private final static int HASHMAP_CUTOVER = 30; + private final HashMap map = new HashMap<>(); + private boolean useHash = false; + + private PointTransitions next(int point) { + // 1st time we are seeing this point + if (count == points.length) { + final PointTransitions[] newArray = new PointTransitions[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(points, 0, newArray, 0, count); + points = newArray; + } + PointTransitions points0 = points[count]; + if (points0 == null) { + points0 = points[count] = new PointTransitions(); + } + points0.reset(point); + count++; + return points0; + } + + private PointTransitions find(int point) { + if (useHash) { + final Integer pi = point; + PointTransitions p = map.get(pi); + if (p == null) { + p = next(point); + map.put(pi, p); + } + return p; + } else { + for(int i=0;i 1) ArrayUtil.timSort(points, 0, count); + } + + public void add(Transition t) { + find(t.min).starts.add(t); + find(1+t.max).ends.add(t); + } + + @Override + public String toString() { + StringBuilder s = new StringBuilder(); + for(int i=0;i 0) { + s.append(' '); + } + s.append(points[i].point).append(':').append(points[i].starts.next/3).append(',').append(points[i].ends.next/3); + } + return s.toString(); + } + } + + /** + * Determinizes the given automaton. + *

+ * Worst case complexity: exponential in number of states. + */ + public static Automaton determinize(Automaton a) { + if (a.isDeterministic()) { + // Already determinized + return a; + } + if (a.getNumStates() <= 1) { + // Already determinized + return a; + } + + // subset construction + Automaton.Builder b = new Automaton.Builder(); + + //System.out.println("DET:"); + //a.writeDot("/l/la/lucene/core/detin.dot"); + + SortedIntSet.FrozenIntSet initialset = new SortedIntSet.FrozenIntSet(0, 0); + + // Create state 0: + b.createState(); + + LinkedList worklist = new LinkedList<>(); + Map newstate = new HashMap<>(); + + worklist.add(initialset); + + b.setAccept(0, a.isAccept(0)); + newstate.put(initialset, 0); + + int newStateUpto = 0; + int[] newStatesArray = new int[5]; + newStatesArray[newStateUpto] = 0; + newStateUpto++; + + // like Set + final PointTransitionSet points = new PointTransitionSet(); + + // like SortedMap + final SortedIntSet statesSet = new SortedIntSet(5); + + Transition t = new Transition(); + + while (worklist.size() > 0) { + SortedIntSet.FrozenIntSet s = worklist.removeFirst(); + //System.out.println("det: pop set=" + s); + + // Collate all outgoing transitions by min/1+max: + for(int i=0;i 0) { + assert lastPoint != -1; + + statesSet.computeHash(); + + Integer q = newstate.get(statesSet); + if (q == null) { + q = b.createState(); + final SortedIntSet.FrozenIntSet p = statesSet.freeze(q); + //System.out.println(" make new state=" + q + " -> " + p + " accCount=" + accCount); + worklist.add(p); + b.setAccept(q, accCount > 0); + newstate.put(p, q); + } else { + assert (accCount > 0 ? true:false) == b.isAccept(q): "accCount=" + accCount + " vs existing accept=" + + b.isAccept(q) + " states=" + statesSet; + } + + // System.out.println(" add trans src=" + r + " dest=" + q + " min=" + lastPoint + " max=" + (point-1)); + + b.addTransition(r, q, lastPoint, point-1); + } + + // process transitions that end on this point + // (closes an overlapping interval) + int[] transitions = points.points[i].ends.transitions; + int limit = points.points[i].ends.next; + for(int j=0;j workList = new LinkedList<>(); + live.set(0); + workList.add(0); + + Transition t = new Transition(); + while (workList.isEmpty() == false) { + int s = workList.removeFirst(); + int count = a.initTransition(s, t); + for(int i=0;i workList = new LinkedList<>(); + BitSet live = new BitSet(numStates); + BitSet acceptBits = a.getAcceptStates(); + int s = 0; + while (s < numStates && (s = acceptBits.nextSetBit(s)) != -1) { + live.set(s); + workList.add(s); + s++; + } + + while (workList.isEmpty() == false) { + s = workList.removeFirst(); + int count = a2.initTransition(s, t); + for(int i=0;i visited = new HashSet<>(); + int s = 0; + boolean done; + Transition t = new Transition(); + do { + done = true; + visited.add(s); + if (a.isAccept(s) == false && a.getNumTransitions(s) == 1) { + a.getTransition(s, 0, t); + if (t.min == t.max && !visited.contains(t.dest)) { + b.appendCodePoint(t.min); + s = t.dest; + done = false; + } + } + } while (!done); + + return b.toString(); + } + + // TODO: this currently requites a determinized machine, + // but it need not -- we can speed it up by walking the + // NFA instead. it'd still be fail fast. + /** + * Returns the longest BytesRef that is a prefix of all accepted strings and + * visits each state at most once. The automaton must be deterministic. + * + * @return common prefix + */ + public static BytesRef getCommonPrefixBytesRef(Automaton a) { + BytesRef ref = new BytesRef(10); + HashSet visited = new HashSet<>(); + int s = 0; + boolean done; + Transition t = new Transition(); + do { + done = true; + visited.add(s); + if (a.isAccept(s) == false && a.getNumTransitions(s) == 1) { + a.getTransition(s, 0, t); + if (t.min == t.max && !visited.contains(t.dest)) { + ref.grow(++ref.length); + ref.bytes[ref.length - 1] = (byte) t.min; + s = t.dest; + done = false; + } + } + } while (!done); + + return ref; + } + + /** + * Returns the longest BytesRef that is a suffix of all accepted strings. + * Worst case complexity: exponential in number of states (this calls + * determinize). + * + * @return common suffix + */ + public static BytesRef getCommonSuffixBytesRef(Automaton a) { + // reverse the language of the automaton, then reverse its common prefix. + Automaton r = Operations.determinize(reverse(a)); + BytesRef ref = getCommonPrefixBytesRef(r); + reverseBytes(ref); + return ref; + } + + private static void reverseBytes(BytesRef ref) { + if (ref.length <= 1) return; + int num = ref.length >> 1; + for (int i = ref.offset; i < ( ref.offset + num ); i++) { + byte b = ref.bytes[i]; + ref.bytes[i] = ref.bytes[ref.offset * 2 + ref.length - i - 1]; + ref.bytes[ref.offset * 2 + ref.length - i - 1] = b; + } + } + + /** Returns an automaton accepting the reverse language. */ + public static Automaton reverse(Automaton a) { + return reverse(a, null); + } + + /** Reverses the automaton, returning the new initial states. */ + static Automaton reverse(Automaton a, Set initialStates) { + + if (Operations.isEmpty(a)) { + return new Automaton(); + } + + int numStates = a.getNumStates(); + + // Build a new automaton with all edges reversed + Automaton.Builder builder = new Automaton.Builder(); + + // Initial node; we'll add epsilon transitions in the end: + builder.createState(); + + for(int s=0;s t.max) { + // We've exhaused the current transition's labels; + // move to next transitions: + transition++; + if (transition >= a.getNumTransitions(state)) { + // We're done iterating transitions leaving this state + return -1; + } + a.getTransition(state, transition, t); + label = t.min; + to = t.dest; + } + return label++; + } + } + + private static PathNode getNode(PathNode[] nodes, int index) { + assert index < nodes.length; + if (nodes[index] == null) { + nodes[index] = new PathNode(); + } + return nodes[index]; + } + + // TODO: this is a dangerous method ... Automaton could be + // huge ... and it's better in general for caller to + // enumerate & process in a single walk: + + /** Returns the set of accepted strings, up to at most + * limit strings. If more than limit + * strings are accepted, the first limit strings found are returned. If limit == -1, then + * the limit is infinite. If the {@link Automaton} has + * cycles then this method might throw {@code + * IllegalArgumentException} but that is not guaranteed + * when the limit is set. */ + public static Set getFiniteStrings(Automaton a, int limit) { + Set results = new HashSet<>(); + + if (limit == -1 || limit > 0) { + // OK + } else { + throw new IllegalArgumentException("limit must be -1 (which means no limit), or > 0; got: " + limit); + } + + if (a.isAccept(0)) { + // Special case the empty string, as usual: + results.add(new IntsRef()); + } + + if (a.getNumTransitions(0) > 0 && (limit == -1 || results.size() < limit)) { + + int numStates = a.getNumStates(); + + // Tracks which states are in the current path, for + // cycle detection: + BitSet pathStates = new BitSet(numStates); + + // Stack to hold our current state in the + // recursion/iteration: + PathNode[] nodes = new PathNode[4]; + + pathStates.set(0); + PathNode root = getNode(nodes, 0); + root.resetState(a, 0); + + IntsRef string = new IntsRef(1); + string.length = 1; + + while (string.length > 0) { + + PathNode node = nodes[string.length-1]; + + // Get next label leaving the current node: + int label = node.nextLabel(a); + + if (label != -1) { + string.ints[string.length-1] = label; + + if (a.isAccept(node.to)) { + // This transition leads to an accept state, + // so we save the current string: + results.add(IntsRef.deepCopyOf(string)); + if (results.size() == limit) { + break; + } + } + + if (a.getNumTransitions(node.to) != 0) { + // Now recurse: the destination of this transition has + // outgoing transitions: + if (pathStates.get(node.to)) { + throw new IllegalArgumentException("automaton has cycles"); + } + pathStates.set(node.to); + + // Push node onto stack: + if (nodes.length == string.length) { + PathNode[] newNodes = new PathNode[ArrayUtil.oversize(nodes.length+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(nodes, 0, newNodes, 0, nodes.length); + nodes = newNodes; + } + getNode(nodes, string.length).resetState(a, node.to); + string.length++; + string.grow(string.length); + } + } else { + // No more transitions leaving this state, + // pop/return back to previous state: + assert pathStates.get(node.state); + pathStates.clear(node.state); + string.length--; + } + } + } + + return results; + } + + /** Returns a new automaton accepting the same language with added + * transitions to a dead state so that from every state and every label + * there is a transition. */ + static Automaton totalize(Automaton a) { + Automaton result = new Automaton(); + int numStates = a.getNumStates(); + for(int i=0;i maxi) { + result.addTransition(i, deadState, maxi, t.min-1); + } + if (t.max + 1 > maxi) { + maxi = t.max + 1; + } + } + + if (maxi <= Character.MAX_CODE_POINT) { + result.addTransition(i, deadState, maxi, Character.MAX_CODE_POINT); + } + } + + result.finishState(); + return result; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java index bf5b4be42064..76a040a620ae 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java @@ -361,8 +361,6 @@ enum Kind { */ public static final int NONE = 0x0000; - private static boolean allow_mutation = false; - Kind kind; RegExp exp1, exp2; String s; @@ -419,13 +417,13 @@ public RegExp(String s, int syntax_flags) throws IllegalArgumentException { to = e.to; b = null; } - + /** * Constructs new Automaton from this RegExp. Same * as toAutomaton(null) (empty automaton map). */ public Automaton toAutomaton() { - return toAutomatonAllowMutate(null, null); + return toAutomaton(null, null); } /** @@ -439,7 +437,7 @@ public Automaton toAutomaton() { */ public Automaton toAutomaton(AutomatonProvider automaton_provider) throws IllegalArgumentException { - return toAutomatonAllowMutate(null, automaton_provider); + return toAutomaton(null, automaton_provider); } /** @@ -454,32 +452,9 @@ public Automaton toAutomaton(AutomatonProvider automaton_provider) */ public Automaton toAutomaton(Map automata) throws IllegalArgumentException { - return toAutomatonAllowMutate(automata, null); - } - - /** - * Sets or resets allow mutate flag. If this flag is set, then automata - * construction uses mutable automata, which is slightly faster but not thread - * safe. By default, the flag is not set. - * - * @param flag if true, the flag is set - * @return previous value of the flag - */ - public boolean setAllowMutate(boolean flag) { - boolean b = allow_mutation; - allow_mutation = flag; - return b; - } - - private Automaton toAutomatonAllowMutate(Map automata, - AutomatonProvider automaton_provider) throws IllegalArgumentException { - boolean b = false; - if (allow_mutation) b = Automaton.setAllowMutate(true); // thread unsafe - Automaton a = toAutomaton(automata, automaton_provider); - if (allow_mutation) Automaton.setAllowMutate(b); - return a; + return toAutomaton(automata, null); } - + private Automaton toAutomaton(Map automata, AutomatonProvider automaton_provider) throws IllegalArgumentException { List list; @@ -489,8 +464,8 @@ private Automaton toAutomaton(Map automata, list = new ArrayList<>(); findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider); findLeaves(exp2, Kind.REGEXP_UNION, list, automata, automaton_provider); - a = BasicOperations.union(list); - MinimizationOperations.minimize(a); + a = Operations.union(list); + a = MinimizationOperations.minimize(a); break; case REGEXP_CONCATENATION: list = new ArrayList<>(); @@ -498,66 +473,72 @@ private Automaton toAutomaton(Map automata, automaton_provider); findLeaves(exp2, Kind.REGEXP_CONCATENATION, list, automata, automaton_provider); - a = BasicOperations.concatenate(list); - MinimizationOperations.minimize(a); + a = Operations.concatenate(list); + a = MinimizationOperations.minimize(a); break; case REGEXP_INTERSECTION: - a = exp1.toAutomaton(automata, automaton_provider).intersection( + a = Operations.intersection( + exp1.toAutomaton(automata, automaton_provider), exp2.toAutomaton(automata, automaton_provider)); - MinimizationOperations.minimize(a); + a = MinimizationOperations.minimize(a); break; case REGEXP_OPTIONAL: - a = exp1.toAutomaton(automata, automaton_provider).optional(); - MinimizationOperations.minimize(a); + a = Operations.optional(exp1.toAutomaton(automata, automaton_provider)); + a = MinimizationOperations.minimize(a); break; case REGEXP_REPEAT: - a = exp1.toAutomaton(automata, automaton_provider).repeat(); - MinimizationOperations.minimize(a); + a = Operations.repeat(exp1.toAutomaton(automata, automaton_provider)); + a = MinimizationOperations.minimize(a); break; case REGEXP_REPEAT_MIN: - a = exp1.toAutomaton(automata, automaton_provider).repeat(min); - MinimizationOperations.minimize(a); + a = Operations.repeat(exp1.toAutomaton(automata, automaton_provider), min); + a = MinimizationOperations.minimize(a); break; case REGEXP_REPEAT_MINMAX: - a = exp1.toAutomaton(automata, automaton_provider).repeat(min, max); - MinimizationOperations.minimize(a); + a = Operations.repeat(exp1.toAutomaton(automata, automaton_provider), min, max); + a = MinimizationOperations.minimize(a); break; case REGEXP_COMPLEMENT: - a = exp1.toAutomaton(automata, automaton_provider).complement(); - MinimizationOperations.minimize(a); + a = Operations.complement(exp1.toAutomaton(automata, automaton_provider)); + a = MinimizationOperations.minimize(a); break; case REGEXP_CHAR: - a = BasicAutomata.makeChar(c); + a = Automata.makeChar(c); break; case REGEXP_CHAR_RANGE: - a = BasicAutomata.makeCharRange(from, to); + a = Automata.makeCharRange(from, to); break; case REGEXP_ANYCHAR: - a = BasicAutomata.makeAnyChar(); + a = Automata.makeAnyChar(); break; case REGEXP_EMPTY: - a = BasicAutomata.makeEmpty(); + a = Automata.makeEmpty(); break; case REGEXP_STRING: - a = BasicAutomata.makeString(s); + a = Automata.makeString(s); break; case REGEXP_ANYSTRING: - a = BasicAutomata.makeAnyString(); + a = Automata.makeAnyString(); break; case REGEXP_AUTOMATON: Automaton aa = null; - if (automata != null) aa = automata.get(s); - if (aa == null && automaton_provider != null) try { - aa = automaton_provider.getAutomaton(s); - } catch (IOException e) { - throw new IllegalArgumentException(e); + if (automata != null) { + aa = automata.get(s); } - if (aa == null) throw new IllegalArgumentException("'" + s - + "' not found"); - a = aa.clone(); // always clone here (ignore allow_mutate) + if (aa == null && automaton_provider != null) { + try { + aa = automaton_provider.getAutomaton(s); + } catch (IOException e) { + throw new IllegalArgumentException(e); + } + } + if (aa == null) { + throw new IllegalArgumentException("'" + s + "' not found"); + } + a = aa; break; case REGEXP_INTERVAL: - a = BasicAutomata.makeInterval(min, max, digits); + a = Automata.makeInterval(min, max, digits); break; } return a; @@ -568,7 +549,9 @@ private void findLeaves(RegExp exp, Kind kind, List list, if (exp.kind == kind) { findLeaves(exp.exp1, kind, list, automata, automaton_provider); findLeaves(exp.exp2, kind, list, automata, automaton_provider); - } else list.add(exp.toAutomaton(automata, automaton_provider)); + } else { + list.add(exp.toAutomaton(automata, automaton_provider)); + } } /** diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java index bbcadd3374a3..7c216321d4f3 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java @@ -37,6 +37,7 @@ * @lucene.experimental */ public abstract class RunAutomaton { + final Automaton automaton; final int maxInterval; final int size; final boolean[] accept; @@ -65,10 +66,10 @@ public String toString() { if (j + 1 < points.length) max = (points[j + 1] - 1); else max = maxInterval; b.append(" "); - Transition.appendCharString(min, b); + Automaton.appendCharString(min, b); if (min != max) { b.append("-"); - Transition.appendCharString(max, b); + Automaton.appendCharString(max, b); } b.append(" -> ").append(k).append("\n"); } @@ -110,7 +111,7 @@ public final int[] getCharIntervals() { * Gets character class of given codepoint */ final int getCharClass(int c) { - return SpecialOperations.findIndex(c, points); + return Operations.findIndex(c, points); } /** @@ -121,23 +122,23 @@ final int getCharClass(int c) { */ public RunAutomaton(Automaton a, int maxInterval, boolean tableize) { this.maxInterval = maxInterval; - a.determinize(); + a = Operations.determinize(a); + this.automaton = a; points = a.getStartPoints(); - final State[] states = a.getNumberedStates(); - initial = a.initial.number; - size = states.length; + initial = 0; + size = Math.max(1,a.getNumStates()); accept = new boolean[size]; transitions = new int[size * points.length]; - for (int n = 0; n < size * points.length; n++) - transitions[n] = -1; - for (State s : states) { - int n = s.number; - accept[n] = s.accept; + Arrays.fill(transitions, -1); + for (int n=0;n 1) { - int d = (a + b) >>> 1; - if (points[d] > c) b = d; - else if (points[d] < c) a = d; - else return d; - } - return a; - } - - /** - * Returns true if the language of this automaton is finite. - */ - public static boolean isFinite(Automaton a) { - if (a.isSingleton()) return true; - return isFinite(a.initial, new BitSet(a.getNumberOfStates()), new BitSet(a.getNumberOfStates())); - } - - /** - * Checks whether there is a loop containing s. (This is sufficient since - * there are never transitions to dead states.) - */ - // TODO: not great that this is recursive... in theory a - // large automata could exceed java's stack - private static boolean isFinite(State s, BitSet path, BitSet visited) { - path.set(s.number); - for (Transition t : s.getTransitions()) - if (path.get(t.to.number) || (!visited.get(t.to.number) && !isFinite(t.to, path, visited))) return false; - path.clear(s.number); - visited.set(s.number); - return true; - } - - /** - * Returns the longest string that is a prefix of all accepted strings and - * visits each state at most once. - * - * @return common prefix - */ - public static String getCommonPrefix(Automaton a) { - if (a.isSingleton()) return a.singleton; - StringBuilder b = new StringBuilder(); - HashSet visited = new HashSet<>(); - State s = a.initial; - boolean done; - do { - done = true; - visited.add(s); - if (!s.accept && s.numTransitions() == 1) { - Transition t = s.getTransitions().iterator().next(); - if (t.min == t.max && !visited.contains(t.to)) { - b.appendCodePoint(t.min); - s = t.to; - done = false; - } - } - } while (!done); - return b.toString(); - } - - // TODO: this currently requites a determinized machine, - // but it need not -- we can speed it up by walking the - // NFA instead. it'd still be fail fast. - public static BytesRef getCommonPrefixBytesRef(Automaton a) { - if (a.isSingleton()) return new BytesRef(a.singleton); - BytesRef ref = new BytesRef(10); - HashSet visited = new HashSet<>(); - State s = a.initial; - boolean done; - do { - done = true; - visited.add(s); - if (!s.accept && s.numTransitions() == 1) { - Transition t = s.getTransitions().iterator().next(); - if (t.min == t.max && !visited.contains(t.to)) { - ref.grow(++ref.length); - ref.bytes[ref.length - 1] = (byte)t.min; - s = t.to; - done = false; - } - } - } while (!done); - return ref; - } - - /** - * Returns the longest string that is a suffix of all accepted strings and - * visits each state at most once. - * - * @return common suffix - */ - public static String getCommonSuffix(Automaton a) { - if (a.isSingleton()) // if singleton, the suffix is the string itself. - return a.singleton; - - // reverse the language of the automaton, then reverse its common prefix. - Automaton r = a.clone(); - reverse(r); - r.determinize(); - return new StringBuilder(SpecialOperations.getCommonPrefix(r)).reverse().toString(); - } - - public static BytesRef getCommonSuffixBytesRef(Automaton a) { - if (a.isSingleton()) // if singleton, the suffix is the string itself. - return new BytesRef(a.singleton); - - // reverse the language of the automaton, then reverse its common prefix. - Automaton r = a.clone(); - reverse(r); - r.determinize(); - BytesRef ref = SpecialOperations.getCommonPrefixBytesRef(r); - reverseBytes(ref); - return ref; - } - - private static void reverseBytes(BytesRef ref) { - if (ref.length <= 1) return; - int num = ref.length >> 1; - for (int i = ref.offset; i < ( ref.offset + num ); i++) { - byte b = ref.bytes[i]; - ref.bytes[i] = ref.bytes[ref.offset * 2 + ref.length - i - 1]; - ref.bytes[ref.offset * 2 + ref.length - i - 1] = b; - } - } - - /** - * Reverses the language of the given (non-singleton) automaton while returning - * the set of new initial states. - */ - public static Set reverse(Automaton a) { - a.expandSingleton(); - // reverse all edges - HashMap> m = new HashMap<>(); - State[] states = a.getNumberedStates(); - Set accept = new HashSet<>(); - for (State s : states) - if (s.isAccept()) - accept.add(s); - for (State r : states) { - m.put(r, new HashSet()); - r.accept = false; - } - for (State r : states) - for (Transition t : r.getTransitions()) - m.get(t.to).add(new Transition(t.min, t.max, r)); - for (State r : states) { - Set tr = m.get(r); - r.setTransitions(tr.toArray(new Transition[tr.size()])); - } - // make new initial+final states - a.initial.accept = true; - a.initial = new State(); - for (State r : accept) - a.initial.addEpsilon(r); // ensures that all initial states are reachable - a.deterministic = false; - a.clearNumberedStates(); - return accept; - } - - private static class PathNode { - - /** Which state the path node ends on, whose - * transitions we are enumerating. */ - public State state; - - /** Which state the current transition leads to. */ - public State to; - - /** Which transition we are on. */ - public int transition; - - /** Which label we are on, in the min-max range of the - * current Transition */ - public int label; - - public void resetState(State state) { - assert state.numTransitions() != 0; - this.state = state; - transition = 0; - Transition t = state.transitionsArray[transition]; - label = t.min; - to = t.to; - } - - /** Returns next label of current transition, or - * advances to next transition and returns its first - * label, if current one is exhausted. If there are - * no more transitions, returns -1. */ - public int nextLabel() { - if (label > state.transitionsArray[transition].max) { - // We've exhaused the current transition's labels; - // move to next transitions: - transition++; - if (transition >= state.numTransitions()) { - // We're done iterating transitions leaving this state - return -1; - } - Transition t = state.transitionsArray[transition]; - label = t.min; - to = t.to; - } - return label++; - } - } - - private static PathNode getNode(PathNode[] nodes, int index) { - assert index < nodes.length; - if (nodes[index] == null) { - nodes[index] = new PathNode(); - } - return nodes[index]; - } - - // TODO: this is a dangerous method ... Automaton could be - // huge ... and it's better in general for caller to - // enumerate & process in a single walk: - - /** Returns the set of accepted strings, up to at most - * limit strings. If more than limit - * strings are accepted, the first limit strings found are returned. If limit == -1, then - * the limit is infinite. If the {@link Automaton} has - * cycles then this method might throw {@code - * IllegalArgumentException} but that is not guaranteed - * when the limit is set. */ - public static Set getFiniteStrings(Automaton a, int limit) { - Set results = new HashSet<>(); - - if (limit == -1 || limit > 0) { - // OK - } else { - throw new IllegalArgumentException("limit must be -1 (which means no limit), or > 0; got: " + limit); - } - - if (a.isSingleton()) { - // Easy case: automaton accepts only 1 string - results.add(Util.toUTF32(a.singleton, new IntsRef())); - } else { - - if (a.initial.accept) { - // Special case the empty string, as usual: - results.add(new IntsRef()); - } - - if (a.initial.numTransitions() > 0 && (limit == -1 || results.size() < limit)) { - - // TODO: we could use state numbers here and just - // alloc array, but asking for states array can be - // costly (it's lazily computed): - - // Tracks which states are in the current path, for - // cycle detection: - Set pathStates = Collections.newSetFromMap(new IdentityHashMap()); - - // Stack to hold our current state in the - // recursion/iteration: - PathNode[] nodes = new PathNode[4]; - - pathStates.add(a.initial); - PathNode root = getNode(nodes, 0); - root.resetState(a.initial); - - IntsRef string = new IntsRef(1); - string.length = 1; - - while (string.length > 0) { - - PathNode node = nodes[string.length-1]; - - // Get next label leaving the current node: - int label = node.nextLabel(); - - if (label != -1) { - string.ints[string.length-1] = label; - - if (node.to.accept) { - // This transition leads to an accept state, - // so we save the current string: - results.add(IntsRef.deepCopyOf(string)); - if (results.size() == limit) { - break; - } - } - - if (node.to.numTransitions() != 0) { - // Now recurse: the destination of this transition has - // outgoing transitions: - if (pathStates.contains(node.to)) { - throw new IllegalArgumentException("automaton has cycles"); - } - pathStates.add(node.to); - - // Push node onto stack: - if (nodes.length == string.length) { - PathNode[] newNodes = new PathNode[ArrayUtil.oversize(nodes.length+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; - System.arraycopy(nodes, 0, newNodes, 0, nodes.length); - nodes = newNodes; - } - getNode(nodes, string.length).resetState(node.to); - string.length++; - string.grow(string.length); - } - } else { - // No more transitions leaving this state, - // pop/return back to previous state: - assert pathStates.contains(node.state); - pathStates.remove(node.state); - string.length--; - } - } - } - } - - return results; - } -} diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/State.java b/lucene/core/src/java/org/apache/lucene/util/automaton/State.java deleted file mode 100644 index d1639e435778..000000000000 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/State.java +++ /dev/null @@ -1,280 +0,0 @@ -/* - * dk.brics.automaton - * - * Copyright (c) 2001-2009 Anders Moeller - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -package org.apache.lucene.util.automaton; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.RamUsageEstimator; - -import java.util.Collection; -import java.util.Comparator; -import java.util.Iterator; - -/** - * Automaton state. - * - * @lucene.experimental - */ -public class State implements Comparable { - - boolean accept; - public Transition[] transitionsArray; - public int numTransitions; - - int number; - - int id; - static int next_id; - - /** - * Constructs a new state. Initially, the new state is a reject state. - */ - public State() { - resetTransitions(); - id = next_id++; - } - - /** - * Resets transition set. - */ - final void resetTransitions() { - transitionsArray = new Transition[0]; - numTransitions = 0; - } - - private class TransitionsIterable implements Iterable { - @Override - public Iterator iterator() { - return new Iterator() { - int upto; - @Override - public boolean hasNext() { - return upto < numTransitions; - } - @Override - public Transition next() { - return transitionsArray[upto++]; - } - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - }; - } - } - - /** - * Returns the set of outgoing transitions. Subsequent changes are reflected - * in the automaton. - * - * @return transition set - */ - public Iterable getTransitions() { - return new TransitionsIterable(); - } - - public int numTransitions() { - return numTransitions; - } - - public void setTransitions(Transition[] transitions) { - this.numTransitions = transitions.length; - this.transitionsArray = transitions; - } - - /** - * Adds an outgoing transition. - * - * @param t transition - */ - public void addTransition(Transition t) { - if (numTransitions == transitionsArray.length) { - final Transition[] newArray = new Transition[ArrayUtil.oversize(1+numTransitions, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; - System.arraycopy(transitionsArray, 0, newArray, 0, numTransitions); - transitionsArray = newArray; - } - transitionsArray[numTransitions++] = t; - } - - /** - * Sets acceptance for this state. - * - * @param accept if true, this state is an accept state - */ - public void setAccept(boolean accept) { - this.accept = accept; - } - - /** - * Returns acceptance status. - * - * @return true is this is an accept state - */ - public boolean isAccept() { - return accept; - } - - /** - * Performs lookup in transitions, assuming determinism. - * - * @param c codepoint to look up - * @return destination state, null if no matching outgoing transition - * @see #step(int, Collection) - */ - public State step(int c) { - assert c >= 0; - for (int i=0;i dest) { - for (int i=0;i max) max = t.max; - } else { - if (p != null) { - transitionsArray[upto++] = new Transition(min, max, p); - } - min = t.min; - max = t.max; - } - } else { - if (p != null) { - transitionsArray[upto++] = new Transition(min, max, p); - } - p = t.to; - min = t.min; - max = t.max; - } - } - - if (p != null) { - transitionsArray[upto++] = new Transition(min, max, p); - } - numTransitions = upto; - } - - /** - * Returns sorted list of outgoing transitions. - * - * @param to_first if true, order by (to, min, reverse max); otherwise (min, - * reverse max, to) - * @return transition list - */ - - /** Sorts transitions array in-place. */ - public void sortTransitions(Comparator comparator) { - // mergesort seems to perform better on already sorted arrays: - if (numTransitions > 1) ArrayUtil.timSort(transitionsArray, 0, numTransitions, comparator); - } - - /** - * Return this state's number. - *

- * Expert: Will be useless unless {@link Automaton#getNumberedStates} - * has been called first to number the states. - * @return the number - */ - public int getNumber() { - return number; - } - - /** - * Returns string describing this state. Normally invoked via - * {@link Automaton#toString()}. - */ - @Override - public String toString() { - StringBuilder b = new StringBuilder(); - b.append("state ").append(number); - if (accept) b.append(" [accept]"); - else b.append(" [reject]"); - b.append(":\n"); - for (Transition t : getTransitions()) - b.append(" ").append(t.toString()).append("\n"); - return b.toString(); - } - - /** - * Compares this object with the specified object for order. States are - * ordered by the time of construction. - */ - @Override - public int compareTo(State s) { - return s.id - id; - } -} diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java index 4124b9e8b930..4ce81ab35a9f 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java @@ -35,11 +35,11 @@ * @lucene.experimental */ public class StatePair { - State s; - State s1; - State s2; + int s; + int s1; + int s2; - StatePair(State s, State s1, State s2) { + StatePair(int s, int s1, int s2) { this.s = s; this.s1 = s1; this.s2 = s2; @@ -51,27 +51,10 @@ public class StatePair { * @param s1 first state * @param s2 second state */ - public StatePair(State s1, State s2) { + public StatePair(int s1, int s2) { this.s1 = s1; this.s2 = s2; - } - - /** - * Returns first component of this pair. - * - * @return first state - */ - public State getFirstState() { - return s1; - } - - /** - * Returns second component of this pair. - * - * @return second state - */ - public State getSecondState() { - return s2; + this.s = -1; } /** @@ -96,6 +79,11 @@ public boolean equals(Object obj) { */ @Override public int hashCode() { - return s1.hashCode() + s2.hashCode(); + return s1 ^ s2; + } + + @Override + public String toString() { + return "StatePair(s1=" + s1 + " s2=" + s2 + ")"; } } diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java index d22c6dbace74..fc5b6589a9bc 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java @@ -1,214 +1,51 @@ +package org.apache.lucene.util.automaton; + /* - * dk.brics.automaton - * - * Copyright (c) 2001-2009 Anders Moeller - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ -package org.apache.lucene.util.automaton; +/** Holds one transition from an {@link Automaton}. This is typically + * used temporarily when iterating through transitions by invoking + * {@link Automaton#initTransition} and {@link Automaton#getNextTransition}. */ -import java.util.Comparator; +public class Transition { -/** - * Automaton transition. - *

- * A transition, which belongs to a source state, consists of a Unicode - * codepoint interval and a destination state. - * - * @lucene.experimental - */ -public class Transition implements Cloneable { - - /* - * CLASS INVARIANT: min<=max - */ - - final int min; - final int max; - final State to; - - /** - * Constructs a new singleton interval transition. - * - * @param c transition codepoint - * @param to destination state - */ - public Transition(int c, State to) { - assert c >= 0; - min = max = c; - this.to = to; - } - - /** - * Constructs a new transition. Both end points are included in the interval. - * - * @param min transition interval minimum - * @param max transition interval maximum - * @param to destination state - */ - public Transition(int min, int max, State to) { - assert min >= 0; - assert max >= 0; - if (max < min) { - int t = max; - max = min; - min = t; - } - this.min = min; - this.max = max; - this.to = to; - } - - /** Returns minimum of this transition interval. */ - public int getMin() { - return min; - } - - /** Returns maximum of this transition interval. */ - public int getMax() { - return max; - } - - /** Returns destination of this transition. */ - public State getDest() { - return to; - } - - /** - * Checks for equality. - * - * @param obj object to compare with - * @return true if obj is a transition with same character interval - * and destination state as this transition. - */ - @Override - public boolean equals(Object obj) { - if (obj instanceof Transition) { - Transition t = (Transition) obj; - return t.min == min && t.max == max && t.to == to; - } else return false; - } - - /** - * Returns hash code. The hash code is based on the character interval (not - * the destination state). - * - * @return hash code - */ - @Override - public int hashCode() { - return min * 2 + max * 3; - } - - /** - * Clones this transition. - * - * @return clone with same character interval and destination state - */ - @Override - public Transition clone() { - try { - return (Transition) super.clone(); - } catch (CloneNotSupportedException e) { - throw new RuntimeException(e); - } - } - - static void appendCharString(int c, StringBuilder b) { - if (c >= 0x21 && c <= 0x7e && c != '\\' && c != '"') b.appendCodePoint(c); - else { - b.append("\\\\U"); - String s = Integer.toHexString(c); - if (c < 0x10) b.append("0000000").append(s); - else if (c < 0x100) b.append("000000").append(s); - else if (c < 0x1000) b.append("00000").append(s); - else if (c < 0x10000) b.append("0000").append(s); - else if (c < 0x100000) b.append("000").append(s); - else if (c < 0x1000000) b.append("00").append(s); - else if (c < 0x10000000) b.append("0").append(s); - else b.append(s); - } - } - - /** - * Returns a string describing this state. Normally invoked via - * {@link Automaton#toString()}. - */ - @Override - public String toString() { - StringBuilder b = new StringBuilder(); - appendCharString(min, b); - if (min != max) { - b.append("-"); - appendCharString(max, b); - } - b.append(" -> ").append(to.number); - return b.toString(); - } - - void appendDot(StringBuilder b) { - b.append(" -> ").append(to.number).append(" [label=\""); - appendCharString(min, b); - if (min != max) { - b.append("-"); - appendCharString(max, b); - } - b.append("\"]\n"); + /** Sole constructor. */ + public Transition() { } - private static final class CompareByDestThenMinMaxSingle implements Comparator { - @Override - public int compare(Transition t1, Transition t2) { - if (t1.to != t2.to) { - if (t1.to.number < t2.to.number) return -1; - else if (t1.to.number > t2.to.number) return 1; - } - if (t1.min < t2.min) return -1; - if (t1.min > t2.min) return 1; - if (t1.max > t2.max) return -1; - if (t1.max < t2.max) return 1; - return 0; - } - } + /** Source state. */ + public int source; - public static final Comparator CompareByDestThenMinMax = new CompareByDestThenMinMaxSingle(); + /** Destination state. */ + public int dest; - private static final class CompareByMinMaxThenDestSingle implements Comparator { - @Override - public int compare(Transition t1, Transition t2) { - if (t1.min < t2.min) return -1; - if (t1.min > t2.min) return 1; - if (t1.max > t2.max) return -1; - if (t1.max < t2.max) return 1; - if (t1.to != t2.to) { - if (t1.to.number < t2.to.number) return -1; - if (t1.to.number > t2.to.number) return 1; - } - return 0; - } - } + /** Minimum accepted label (inclusive). */ + public int min; + + /** Maximum accepted label (inclusive). */ + public int max; - public static final Comparator CompareByMinMaxThenDest = new CompareByMinMaxThenDestSingle(); + /** Remembers where we are in the iteration; init to -1 to provoke + * exception if nextTransition is called without first initTransition. */ + int transitionUpto = -1; + + @Override + public String toString() { + return source + " --> " + dest + " " + (char) min + "-" + (char) max; + } } + diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java b/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java index 17be0ec152b2..059ee09b4bb3 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java @@ -17,11 +17,9 @@ * limitations under the License. */ -import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.ArrayUtil; - -import java.util.List; +import java.util.Arrays; import java.util.ArrayList; +import java.util.List; // TODO // - do we really need the .bits...? if not we can make util in UnicodeUtil to convert 1 char into a BytesRef @@ -122,6 +120,10 @@ public String toString() { } } + /** Sole constructor. */ + public UTF32ToUTF8() { + } + private final UTF8Sequence startUTF8 = new UTF8Sequence(); private final UTF8Sequence endUTF8 = new UTF8Sequence(); @@ -129,37 +131,37 @@ public String toString() { private final UTF8Sequence tmpUTF8b = new UTF8Sequence(); // Builds necessary utf8 edges between start & end - void convertOneEdge(State start, State end, int startCodePoint, int endCodePoint) { + void convertOneEdge(int start, int end, int startCodePoint, int endCodePoint) { startUTF8.set(startCodePoint); endUTF8.set(endCodePoint); - //System.out.println("start = " + startUTF8); - //System.out.println(" end = " + endUTF8); build(start, end, startUTF8, endUTF8, 0); } - private void build(State start, State end, UTF8Sequence startUTF8, UTF8Sequence endUTF8, int upto) { + private void build(int start, int end, UTF8Sequence startUTF8, UTF8Sequence endUTF8, int upto) { // Break into start, middle, end: if (startUTF8.byteAt(upto) == endUTF8.byteAt(upto)) { // Degen case: lead with the same byte: if (upto == startUTF8.len-1 && upto == endUTF8.len-1) { // Super degen: just single edge, one UTF8 byte: - start.addTransition(new Transition(startUTF8.byteAt(upto), endUTF8.byteAt(upto), end)); + utf8.addTransition(start, end, startUTF8.byteAt(upto), endUTF8.byteAt(upto)); return; } else { assert startUTF8.len > upto+1; assert endUTF8.len > upto+1; - State n = newUTF8State(); + int n = utf8.createState(); // Single value leading edge - start.addTransition(new Transition(startUTF8.byteAt(upto), n)); // type=single + utf8.addTransition(start, n, startUTF8.byteAt(upto)); + //start.addTransition(new Transition(startUTF8.byteAt(upto), n)); // type=single // Recurse for the rest build(n, end, startUTF8, endUTF8, 1+upto); } } else if (startUTF8.len == endUTF8.len) { if (upto == startUTF8.len-1) { - start.addTransition(new Transition(startUTF8.byteAt(upto), endUTF8.byteAt(upto), end)); // type=startend + //start.addTransition(new Transition(startUTF8.byteAt(upto), endUTF8.byteAt(upto), end)); // type=startend + utf8.addTransition(start, end, startUTF8.byteAt(upto), endUTF8.byteAt(upto)); } else { start(start, end, startUTF8, upto, false); if (endUTF8.byteAt(upto) - startUTF8.byteAt(upto) > 1) { @@ -193,62 +195,69 @@ private void build(State start, State end, UTF8Sequence startUTF8, UTF8Sequence } } - private void start(State start, State end, UTF8Sequence utf8, int upto, boolean doAll) { - if (upto == utf8.len-1) { + private void start(int start, int end, UTF8Sequence startUTF8, int upto, boolean doAll) { + if (upto == startUTF8.len-1) { // Done recursing - start.addTransition(new Transition(utf8.byteAt(upto), utf8.byteAt(upto) | MASKS[utf8.numBits(upto)-1], end)); // type=start + utf8.addTransition(start, end, startUTF8.byteAt(upto), startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1]); // type=start + //start.addTransition(new Transition(startUTF8.byteAt(upto), startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1], end)); // type=start } else { - State n = newUTF8State(); - start.addTransition(new Transition(utf8.byteAt(upto), n)); // type=start - start(n, end, utf8, 1+upto, true); - int endCode = utf8.byteAt(upto) | MASKS[utf8.numBits(upto)-1]; - if (doAll && utf8.byteAt(upto) != endCode) { - all(start, end, utf8.byteAt(upto)+1, endCode, utf8.len-upto-1); + int n = utf8.createState(); + utf8.addTransition(start, n, startUTF8.byteAt(upto)); + //start.addTransition(new Transition(startUTF8.byteAt(upto), n)); // type=start + start(n, end, startUTF8, 1+upto, true); + int endCode = startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1]; + if (doAll && startUTF8.byteAt(upto) != endCode) { + all(start, end, startUTF8.byteAt(upto)+1, endCode, startUTF8.len-upto-1); } } } - private void end(State start, State end, UTF8Sequence utf8, int upto, boolean doAll) { - if (upto == utf8.len-1) { + private void end(int start, int end, UTF8Sequence endUTF8, int upto, boolean doAll) { + if (upto == endUTF8.len-1) { // Done recursing - start.addTransition(new Transition(utf8.byteAt(upto) & (~MASKS[utf8.numBits(upto)-1]), utf8.byteAt(upto), end)); // type=end + //start.addTransition(new Transition(endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]), endUTF8.byteAt(upto), end)); // type=end + utf8.addTransition(start, end, endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]), endUTF8.byteAt(upto)); } else { final int startCode; - if (utf8.numBits(upto) == 5) { - // special case -- avoid created unused edges (utf8 + if (endUTF8.numBits(upto) == 5) { + // special case -- avoid created unused edges (endUTF8 // doesn't accept certain byte sequences) -- there // are other cases we could optimize too: startCode = 194; } else { - startCode = utf8.byteAt(upto) & (~MASKS[utf8.numBits(upto)-1]); + startCode = endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]); } - if (doAll && utf8.byteAt(upto) != startCode) { - all(start, end, startCode, utf8.byteAt(upto)-1, utf8.len-upto-1); + if (doAll && endUTF8.byteAt(upto) != startCode) { + all(start, end, startCode, endUTF8.byteAt(upto)-1, endUTF8.len-upto-1); } - State n = newUTF8State(); - start.addTransition(new Transition(utf8.byteAt(upto), n)); // type=end - end(n, end, utf8, 1+upto, true); + int n = utf8.createState(); + //start.addTransition(new Transition(endUTF8.byteAt(upto), n)); // type=end + utf8.addTransition(start, n, endUTF8.byteAt(upto)); + end(n, end, endUTF8, 1+upto, true); } } - private void all(State start, State end, int startCode, int endCode, int left) { + private void all(int start, int end, int startCode, int endCode, int left) { if (left == 0) { - start.addTransition(new Transition(startCode, endCode, end)); // type=all + //start.addTransition(new Transition(startCode, endCode, end)); // type=all + utf8.addTransition(start, end, startCode, endCode); } else { - State lastN = newUTF8State(); - start.addTransition(new Transition(startCode, endCode, lastN)); // type=all + int lastN = utf8.createState(); + //start.addTransition(new Transition(startCode, endCode, lastN)); // type=all + utf8.addTransition(start, lastN, startCode, endCode); while (left > 1) { - State n = newUTF8State(); - lastN.addTransition(new Transition(128, 191, n)); // type=all* + int n = utf8.createState(); + //lastN.addTransition(new Transition(128, 191, n)); // type=all* + utf8.addTransition(lastN, n, 128, 191); // type=all* left--; lastN = n; } - lastN.addTransition(new Transition(128, 191, end)); // type = all* + //lastN.addTransition(new Transition(128, 191, end)); // type = all* + utf8.addTransition(lastN, end, 128, 191); // type = all* } } - private State[] utf8States; - private int utf8StateCount; + Automaton.Builder utf8; /** Converts an incoming utf32 automaton to an equivalent * utf8 one. The incoming automaton need not be @@ -256,61 +265,49 @@ private void all(State start, State end, int startCode, int endCode, int left) { * not in general be deterministic, so you must * determinize it if that's needed. */ public Automaton convert(Automaton utf32) { - if (utf32.isSingleton()) { - utf32 = utf32.cloneExpanded(); + if (utf32.getNumStates() == 0) { + return utf32; } - State[] map = new State[utf32.getNumberedStates().length]; - List pending = new ArrayList<>(); - State utf32State = utf32.getInitialState(); - pending.add(utf32State); - Automaton utf8 = new Automaton(); - utf8.setDeterministic(false); - - State utf8State = utf8.getInitialState(); + int[] map = new int[utf32.getNumStates()]; + Arrays.fill(map, -1); - utf8States = new State[5]; - utf8StateCount = 0; - utf8State.number = utf8StateCount; - utf8States[utf8StateCount] = utf8State; - utf8StateCount++; + List pending = new ArrayList<>(); + int utf32State = 0; + pending.add(utf32State); + utf8 = new Automaton.Builder(); + + int utf8State = utf8.createState(); - utf8State.setAccept(utf32State.isAccept()); + utf8.setAccept(utf8State, utf32.isAccept(utf32State)); - map[utf32State.number] = utf8State; + map[utf32State] = utf8State; + + Transition scratch = new Transition(); - while(pending.size() != 0) { + while (pending.size() != 0) { utf32State = pending.remove(pending.size()-1); - utf8State = map[utf32State.number]; - for(int i=0;i as = new ArrayList<>(); for(String s : strings) { - as.add(BasicAutomata.makeString(s)); + as.add(s2a(s)); as.add(SEP_A); } as.remove(as.size()-1); - return BasicOperations.concatenate(as); + return Operations.concatenate(as); } private Automaton join(Automaton ... as) { - return BasicOperations.concatenate(Arrays.asList(as)); + return Operations.concatenate(Arrays.asList(as)); } private Automaton s2a(String s) { - return BasicAutomata.makeString(s); + return Automata.makeString(s); } public void testTwoTokens() throws Exception { @@ -482,7 +485,8 @@ public void testTwoTokens() throws Exception { final Automaton expected = join("abc", "def"); //toDot(actual); - assertTrue(BasicOperations.sameLanguage(expected, actual)); + assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)), + Operations.determinize(Operations.removeDeadStates(actual)))); } public void testHole() throws Exception { @@ -497,7 +501,8 @@ public void testHole() throws Exception { final Automaton expected = join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def")); //toDot(actual); - assertTrue(BasicOperations.sameLanguage(expected, actual)); + assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)), + Operations.determinize(Operations.removeDeadStates(actual)))); } public void testOverlappedTokensSausage() throws Exception { @@ -509,10 +514,11 @@ public void testOverlappedTokensSausage() throws Exception { token("xyz", 0, 1) }); final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts); - final Automaton a1 = BasicAutomata.makeString("abc"); - final Automaton a2 = BasicAutomata.makeString("xyz"); - final Automaton expected = BasicOperations.union(a1, a2); - assertTrue(BasicOperations.sameLanguage(expected, actual)); + final Automaton a1 = s2a("abc"); + final Automaton a2 = s2a("xyz"); + final Automaton expected = Operations.union(a1, a2); + assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)), + Operations.determinize(Operations.removeDeadStates(actual)))); } public void testOverlappedTokensLattice() throws Exception { @@ -524,12 +530,13 @@ public void testOverlappedTokensLattice() throws Exception { token("def", 1, 1), }); final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts); - final Automaton a1 = BasicAutomata.makeString("xyz"); + final Automaton a1 = s2a("xyz"); final Automaton a2 = join("abc", "def"); - final Automaton expected = BasicOperations.union(a1, a2); + final Automaton expected = Operations.union(a1, a2); //toDot(actual); - assertTrue(BasicOperations.sameLanguage(expected, actual)); + assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)), + Operations.determinize(Operations.removeDeadStates(actual)))); } public void testSynOverHole() throws Exception { @@ -541,13 +548,14 @@ public void testSynOverHole() throws Exception { token("b", 2, 1), }); final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts); - final Automaton a1 = BasicOperations.union( + final Automaton a1 = Operations.union( join(s2a("a"), SEP_A, HOLE_A), - BasicAutomata.makeString("X")); - final Automaton expected = BasicOperations.concatenate(a1, + s2a("X")); + final Automaton expected = Operations.concatenate(a1, join(SEP_A, s2a("b"))); //toDot(actual); - assertTrue(BasicOperations.sameLanguage(expected, actual)); + assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)), + Operations.determinize(Operations.removeDeadStates(actual)))); } public void testSynOverHole2() throws Exception { @@ -559,10 +567,11 @@ public void testSynOverHole2() throws Exception { token("def", 2, 1), }); final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts); - final Automaton expected = BasicOperations.union( + final Automaton expected = Operations.union( join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")), - BasicAutomata.makeString("abc")); - assertTrue(BasicOperations.sameLanguage(expected, actual)); + s2a("abc")); + assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)), + Operations.determinize(Operations.removeDeadStates(actual)))); } public void testOverlappedTokensLattice2() throws Exception { @@ -575,11 +584,12 @@ public void testOverlappedTokensLattice2() throws Exception { token("ghi", 1, 1), }); final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts); - final Automaton a1 = BasicAutomata.makeString("xyz"); + final Automaton a1 = s2a("xyz"); final Automaton a2 = join("abc", "def", "ghi"); - final Automaton expected = BasicOperations.union(a1, a2); + final Automaton expected = Operations.union(a1, a2); //toDot(actual); - assertTrue(BasicOperations.sameLanguage(expected, actual)); + assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)), + Operations.determinize(Operations.removeDeadStates(actual)))); } public void testToDot() throws Exception { @@ -597,7 +607,8 @@ public void testStartsWithHole() throws Exception { final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts); final Automaton expected = join(HOLE_A, SEP_A, s2a("abc")); //toDot(actual); - assertTrue(BasicOperations.sameLanguage(expected, actual)); + assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)), + Operations.determinize(Operations.removeDeadStates(actual)))); } // TODO: testEndsWithHole... but we need posInc to set in TS.end() @@ -609,8 +620,9 @@ public void testSynHangingOverEnd() throws Exception { token("X", 0, 10), }); final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts); - final Automaton expected = BasicOperations.union(BasicAutomata.makeString("a"), - BasicAutomata.makeString("X")); - assertTrue(BasicOperations.sameLanguage(expected, actual)); + final Automaton expected = Operations.union(s2a("a"), + s2a("X")); + assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)), + Operations.determinize(Operations.removeDeadStates(actual)))); } } diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java b/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java index 18d5f8083c64..9d68ea864e36 100644 --- a/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java +++ b/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java @@ -34,10 +34,9 @@ import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.TestUtil; -import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.AutomatonTestUtil; -import org.apache.lucene.util.automaton.BasicAutomata; -import org.apache.lucene.util.automaton.BasicOperations; +import org.apache.lucene.util.automaton.Automata; +import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.apache.lucene.util.automaton.RegExp; @@ -165,9 +164,9 @@ public void testStop() throws Exception { public void testKeep() throws Exception { CharacterRunAutomaton keepWords = new CharacterRunAutomaton( - BasicOperations.complement( - Automaton.union( - Arrays.asList(BasicAutomata.makeString("foo"), BasicAutomata.makeString("bar"))))); + Operations.complement( + Operations.union( + Arrays.asList(Automata.makeString("foo"), Automata.makeString("bar"))))); Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords); assertAnalyzesTo(a, "quick foo brown bar bar fox foo", new String[] { "foo", "bar", "bar", "foo" }, diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index 03881d91b9d0..533cc7222d70 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -84,9 +84,9 @@ import org.apache.lucene.util.TestUtil; import org.apache.lucene.util.ThreadInterruptedException; import org.apache.lucene.util.Version; -import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.CharacterRunAutomaton; +import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.packed.PackedInts; import org.junit.Test; @@ -2006,7 +2006,7 @@ protected TokenStreamComponents createComponents(String fieldName) { public void testStopwordsPosIncHole2() throws Exception { // use two stopfilters for testing here Directory dir = newDirectory(); - final Automaton secondSet = BasicAutomata.makeString("foobar"); + final Automaton secondSet = Automata.makeString("foobar"); Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java index ecf4e9b05ca3..9acc4fd3b5ff 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java @@ -33,9 +33,9 @@ import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; -import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.RegExp; @SuppressCodecs({ "SimpleText", "Memory", "Direct" }) @@ -244,7 +244,7 @@ public void testIntersectRandom() throws IOException { if (VERBOSE) { System.out.println("\nTEST: empty automaton"); } - a = BasicAutomata.makeEmpty(); + a = Automata.makeEmpty(); } else { if (VERBOSE) { System.out.println("\nTEST: keepPct=" + keepPct); @@ -259,16 +259,9 @@ public void testIntersectRandom() throws IOException { acceptTerms.add(s2); sortedAcceptTerms.add(new BytesRef(s2)); } - a = BasicAutomata.makeStringUnion(sortedAcceptTerms); + a = Automata.makeStringUnion(sortedAcceptTerms); } - if (random().nextBoolean()) { - if (VERBOSE) { - System.out.println("TEST: reduce the automaton"); - } - a.reduce(); - } - final CompiledAutomaton c = new CompiledAutomaton(a, true, false); final BytesRef[] acceptTermsArray = new BytesRef[acceptTerms.size()]; @@ -745,7 +738,7 @@ public void testIntersectBasic() throws Exception { w.shutdown(); AtomicReader sub = getOnlySegmentReader(r); Terms terms = sub.fields().terms("field"); - Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton(); + Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton(); CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false); TermsEnum te = terms.intersect(ca, null); assertEquals("aaa", te.next().utf8ToString()); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java index 55f8f0d7ce10..f261f16e3e7d 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java @@ -31,6 +31,7 @@ import org.apache.lucene.search.AutomatonQuery; import org.apache.lucene.search.CheckHits; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; @@ -68,7 +69,7 @@ public void setUp() throws Exception { writer.addDocument(doc); } - termsAutomaton = BasicAutomata.makeStringUnion(terms); + termsAutomaton = Automata.makeStringUnion(terms); reader = writer.getReader(); searcher = newSearcher(reader); @@ -84,23 +85,27 @@ public void tearDown() throws Exception { /** tests a pre-intersected automaton against the original */ public void testFiniteVersusInfinite() throws Exception { + for (int i = 0; i < numIterations; i++) { String reg = AutomatonTestUtil.randomRegexp(random()); - Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton(); + Automaton automaton = Operations.determinize(new RegExp(reg, RegExp.NONE).toAutomaton()); final List matchedTerms = new ArrayList<>(); for(BytesRef t : terms) { - if (BasicOperations.run(automaton, t.utf8ToString())) { + if (Operations.run(automaton, t.utf8ToString())) { matchedTerms.add(t); } } - Automaton alternate = BasicAutomata.makeStringUnion(matchedTerms); + Automaton alternate = Automata.makeStringUnion(matchedTerms); //System.out.println("match " + matchedTerms.size() + " " + alternate.getNumberOfStates() + " states, sigma=" + alternate.getStartPoints().length); //AutomatonTestUtil.minimizeSimple(alternate); //System.out.println("minmize done"); AutomatonQuery a1 = new AutomatonQuery(new Term("field", ""), automaton); AutomatonQuery a2 = new AutomatonQuery(new Term("field", ""), alternate); - CheckHits.checkEqual(a1, searcher.search(a1, 25).scoreDocs, searcher.search(a2, 25).scoreDocs); + + ScoreDoc[] origHits = searcher.search(a1, 25).scoreDocs; + ScoreDoc[] newHits = searcher.search(a2, 25).scoreDocs; + CheckHits.checkEqual(a1, origHits, newHits); } } @@ -108,13 +113,13 @@ public void testFiniteVersusInfinite() throws Exception { public void testSeeking() throws Exception { for (int i = 0; i < numIterations; i++) { String reg = AutomatonTestUtil.randomRegexp(random()); - Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton(); + Automaton automaton = Operations.determinize(new RegExp(reg, RegExp.NONE).toAutomaton()); TermsEnum te = MultiFields.getTerms(reader, "field").iterator(null); ArrayList unsortedTerms = new ArrayList<>(terms); Collections.shuffle(unsortedTerms, random()); for (BytesRef term : unsortedTerms) { - if (BasicOperations.run(automaton, term.utf8ToString())) { + if (Operations.run(automaton, term.utf8ToString())) { // term is accepted if (random().nextBoolean()) { // seek exact @@ -153,16 +158,16 @@ public void testIntersect() throws Exception { for (int i = 0; i < numIterations; i++) { String reg = AutomatonTestUtil.randomRegexp(random()); Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton(); - CompiledAutomaton ca = new CompiledAutomaton(automaton, SpecialOperations.isFinite(automaton), false); + CompiledAutomaton ca = new CompiledAutomaton(automaton, Operations.isFinite(automaton), false); TermsEnum te = MultiFields.getTerms(reader, "field").intersect(ca, null); - Automaton expected = BasicOperations.intersection(termsAutomaton, automaton); + Automaton expected = Operations.determinize(Operations.intersection(termsAutomaton, automaton)); TreeSet found = new TreeSet<>(); while (te.next() != null) { found.add(BytesRef.deepCopyOf(te.term())); } - Automaton actual = BasicAutomata.makeStringUnion(found); - assertTrue(BasicOperations.sameLanguage(expected, actual)); + Automaton actual = Operations.determinize(Automata.makeStringUnion(found)); + assertTrue(Operations.sameLanguage(expected, actual)); } } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java index 1887ed02767c..c1cc032797d9 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java @@ -33,10 +33,10 @@ import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.Rethrow; import org.apache.lucene.util.TestUtil; -import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.AutomatonTestUtil; -import org.apache.lucene.util.automaton.BasicAutomata; -import org.apache.lucene.util.automaton.BasicOperations; +import org.apache.lucene.util.automaton.Automata; +import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.automaton.Automaton; public class TestAutomatonQuery extends LuceneTestCase { private Directory directory; @@ -106,24 +106,24 @@ private void assertAutomatonHits(int expected, Automaton automaton) /** * Test some very simple automata. */ - public void testBasicAutomata() throws IOException { - assertAutomatonHits(0, BasicAutomata.makeEmpty()); - assertAutomatonHits(0, BasicAutomata.makeEmptyString()); - assertAutomatonHits(2, BasicAutomata.makeAnyChar()); - assertAutomatonHits(3, BasicAutomata.makeAnyString()); - assertAutomatonHits(2, BasicAutomata.makeString("doc")); - assertAutomatonHits(1, BasicAutomata.makeChar('a')); - assertAutomatonHits(2, BasicAutomata.makeCharRange('a', 'b')); - assertAutomatonHits(2, BasicAutomata.makeInterval(1233, 2346, 0)); - assertAutomatonHits(1, BasicAutomata.makeInterval(0, 2000, 0)); - assertAutomatonHits(2, BasicOperations.union(BasicAutomata.makeChar('a'), - BasicAutomata.makeChar('b'))); - assertAutomatonHits(0, BasicOperations.intersection(BasicAutomata - .makeChar('a'), BasicAutomata.makeChar('b'))); - assertAutomatonHits(1, BasicOperations.minus(BasicAutomata.makeCharRange('a', 'b'), - BasicAutomata.makeChar('a'))); + public void testAutomata() throws IOException { + assertAutomatonHits(0, Automata.makeEmpty()); + assertAutomatonHits(0, Automata.makeEmptyString()); + assertAutomatonHits(2, Automata.makeAnyChar()); + assertAutomatonHits(3, Automata.makeAnyString()); + assertAutomatonHits(2, Automata.makeString("doc")); + assertAutomatonHits(1, Automata.makeChar('a')); + assertAutomatonHits(2, Automata.makeCharRange('a', 'b')); + assertAutomatonHits(2, Automata.makeInterval(1233, 2346, 0)); + assertAutomatonHits(1, Automata.makeInterval(0, 2000, 0)); + assertAutomatonHits(2, Operations.union(Automata.makeChar('a'), + Automata.makeChar('b'))); + assertAutomatonHits(0, Operations.intersection(Automata + .makeChar('a'), Automata.makeChar('b'))); + assertAutomatonHits(1, Operations.minus(Automata.makeCharRange('a', 'b'), + Automata.makeChar('a'))); } - + /** * Test that a nondeterministic automaton works correctly. (It should will be * determinized) @@ -131,26 +131,27 @@ public void testBasicAutomata() throws IOException { public void testNFA() throws IOException { // accept this or three, the union is an NFA (two transitions for 't' from // initial state) - Automaton nfa = BasicOperations.union(BasicAutomata.makeString("this"), - BasicAutomata.makeString("three")); + Automaton nfa = Operations.union(Automata.makeString("this"), + Automata.makeString("three")); assertAutomatonHits(2, nfa); } public void testEquals() { - AutomatonQuery a1 = new AutomatonQuery(newTerm("foobar"), BasicAutomata + AutomatonQuery a1 = new AutomatonQuery(newTerm("foobar"), Automata .makeString("foobar")); // reference to a1 AutomatonQuery a2 = a1; // same as a1 (accepts the same language, same term) - AutomatonQuery a3 = new AutomatonQuery(newTerm("foobar"), BasicOperations - .concatenate(BasicAutomata.makeString("foo"), BasicAutomata - .makeString("bar"))); + AutomatonQuery a3 = new AutomatonQuery(newTerm("foobar"), + Operations.concatenate( + Automata.makeString("foo"), + Automata.makeString("bar"))); // different than a1 (same term, but different language) - AutomatonQuery a4 = new AutomatonQuery(newTerm("foobar"), BasicAutomata - .makeString("different")); + AutomatonQuery a4 = new AutomatonQuery(newTerm("foobar"), + Automata.makeString("different")); // different than a1 (different term, same language) - AutomatonQuery a5 = new AutomatonQuery(newTerm("blah"), BasicAutomata - .makeString("foobar")); + AutomatonQuery a5 = new AutomatonQuery(newTerm("blah"), + Automata.makeString("foobar")); assertEquals(a1.hashCode(), a2.hashCode()); assertEquals(a1, a2); @@ -176,8 +177,7 @@ public void testEquals() { * MultiTermQuery semantics. */ public void testRewriteSingleTerm() throws IOException { - AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), BasicAutomata - .makeString("piece")); + AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), Automata.makeString("piece")); Terms terms = MultiFields.getTerms(searcher.getIndexReader(), FN); assertTrue(aq.getTermsEnum(terms) instanceof SingleTermsEnum); assertEquals(1, automatonQueryNrHits(aq)); @@ -188,10 +188,8 @@ public void testRewriteSingleTerm() throws IOException { * MultiTermQuery semantics. */ public void testRewritePrefix() throws IOException { - Automaton pfx = BasicAutomata.makeString("do"); - pfx.expandSingleton(); // expand singleton representation for testing - Automaton prefixAutomaton = BasicOperations.concatenate(pfx, BasicAutomata - .makeAnyString()); + Automaton pfx = Automata.makeString("do"); + Automaton prefixAutomaton = Operations.concatenate(pfx, Automata.makeAnyString()); AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), prefixAutomaton); Terms terms = MultiFields.getTerms(searcher.getIndexReader(), FN); assertTrue(aq.getTermsEnum(terms) instanceof PrefixTermsEnum); @@ -202,8 +200,7 @@ public void testRewritePrefix() throws IOException { * Test handling of the empty language */ public void testEmptyOptimization() throws IOException { - AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), BasicAutomata - .makeEmpty()); + AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), Automata.makeEmpty()); // not yet available: assertTrue(aq.getEnum(searcher.getIndexReader()) // instanceof EmptyTermEnum); Terms terms = MultiFields.getTerms(searcher.getIndexReader(), FN); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestDocTermOrdsRewriteMethod.java b/lucene/core/src/test/org/apache/lucene/search/TestDocTermOrdsRewriteMethod.java index d80d703668bb..9dd845fdf7fb 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestDocTermOrdsRewriteMethod.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestDocTermOrdsRewriteMethod.java @@ -78,7 +78,7 @@ public void setUp() throws Exception { Collections.sort(terms); System.out.println("UTF16 order:"); for(String s : terms) { - System.out.println(" " + UnicodeUtil.toHexString(s)); + System.out.println(" " + UnicodeUtil.toHexString(s) + " " + s); } } @@ -115,7 +115,7 @@ public void testRegexps() throws Exception { /** check that the # of hits is the same as if the query * is run against the inverted index */ - protected void assertSame(String regexp) throws IOException { + protected void assertSame(String regexp) throws IOException { RegexpQuery docValues = new RegexpQuery(new Term(fieldName, regexp), RegExp.NONE); docValues.setRewriteMethod(new DocTermOrdsRewriteMethod()); RegexpQuery inverted = new RegexpQuery(new Term(fieldName, regexp), RegExp.NONE); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java index b276b894c6ab..327a003ef35c 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java @@ -38,6 +38,21 @@ */ public class TestFuzzyQuery extends LuceneTestCase { + public void testBasicPrefix() throws Exception { + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory); + addDoc("abc", writer); + IndexReader reader = writer.getReader(); + IndexSearcher searcher = newSearcher(reader); + writer.shutdown(); + + FuzzyQuery query = new FuzzyQuery(new Term("field", "abc"), FuzzyQuery.defaultMaxEdits, 1); + ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(1, hits.length); + reader.close(); + directory.close(); + } + public void testFuzziness() throws Exception { Directory directory = newDirectory(); RandomIndexWriter writer = new RandomIndexWriter(random(), directory); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java index 05ea7a1249db..6ed31407824a 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java @@ -27,10 +27,10 @@ import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.automaton.Automata; +import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.AutomatonProvider; -import org.apache.lucene.util.automaton.BasicAutomata; -import org.apache.lucene.util.automaton.BasicOperations; import org.apache.lucene.util.automaton.RegExp; /** @@ -97,10 +97,10 @@ public void testRegexComplement() throws IOException { public void testCustomProvider() throws IOException { AutomatonProvider myProvider = new AutomatonProvider() { // automaton that matches quick or brown - private Automaton quickBrownAutomaton = BasicOperations.union(Arrays - .asList(BasicAutomata.makeString("quick"), - BasicAutomata.makeString("brown"), - BasicAutomata.makeString("bob"))); + private Automaton quickBrownAutomaton = Operations.union(Arrays + .asList(Automata.makeString("quick"), + Automata.makeString("brown"), + Automata.makeString("bob"))); @Override public Automaton getAutomaton(String name) { @@ -108,8 +108,7 @@ public Automaton getAutomaton(String name) { else return null; } }; - RegexpQuery query = new RegexpQuery(newTerm(""), RegExp.ALL, - myProvider); + RegexpQuery query = new RegexpQuery(newTerm(""), RegExp.ALL, myProvider); assertEquals(1, searcher.search(query, 5).totalHits); } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestRegexpRandom2.java b/lucene/core/src/test/org/apache/lucene/search/TestRegexpRandom2.java index f6f2be586f97..ead284b90223 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestRegexpRandom2.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestRegexpRandom2.java @@ -40,9 +40,9 @@ import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; import org.apache.lucene.util.UnicodeUtil; -import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.AutomatonTestUtil; import org.apache.lucene.util.automaton.CharacterRunAutomaton; +import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.RegExp; /** diff --git a/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java b/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java index 5d65451714d4..195c3fdc5344 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java @@ -268,7 +268,7 @@ private void assertMatches(IndexSearcher searcher, Query q, int expectedMatches) * Test that wild card queries are parsed to the correct type and are searched correctly. * This test looks at both parsing and execution of wildcard queries. * Although placed here, it also tests prefix queries, verifying that - * prefix queries are not parsed into wild card queries, and viceversa. + * prefix queries are not parsed into wild card queries, and vice-versa. */ public void testParsingAndSearching() throws Exception { String field = "content"; diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java new file mode 100644 index 000000000000..fe0842fb6f7f --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java @@ -0,0 +1,1065 @@ +package org.apache.lucene.util.automaton; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.automaton.AutomatonTestUtil.RandomAcceptedStrings; +import org.apache.lucene.util.fst.Util; + +public class TestAutomaton extends LuceneTestCase { + + public void testBasic() throws Exception { + Automaton a = new Automaton(); + int start = a.createState(); + int x = a.createState(); + int y = a.createState(); + int end = a.createState(); + a.setAccept(end, true); + + a.addTransition(start, x, 'a', 'a'); + a.addTransition(start, end, 'd', 'd'); + a.addTransition(x, y, 'b', 'b'); + a.addTransition(y, end, 'c', 'c'); + a.finishState(); + } + + public void testReduceBasic() throws Exception { + Automaton a = new Automaton(); + int start = a.createState(); + int end = a.createState(); + a.setAccept(end, true); + // Should collapse to a-b: + a.addTransition(start, end, 'a', 'a'); + a.addTransition(start, end, 'b', 'b'); + a.addTransition(start, end, 'm', 'm'); + // Should collapse to x-y: + a.addTransition(start, end, 'x', 'x'); + a.addTransition(start, end, 'y', 'y'); + + a.finishState(); + assertEquals(3, a.getNumTransitions(start)); + Transition scratch = new Transition(); + a.initTransition(start, scratch); + a.getNextTransition(scratch); + assertEquals('a', scratch.min); + assertEquals('b', scratch.max); + a.getNextTransition(scratch); + assertEquals('m', scratch.min); + assertEquals('m', scratch.max); + a.getNextTransition(scratch); + assertEquals('x', scratch.min); + assertEquals('y', scratch.max); + } + + public void testSameLanguage() throws Exception { + Automaton a1 = Automata.makeString("foobar"); + Automaton a2 = Operations.removeDeadStates(Operations.concatenate( + Automata.makeString("foo"), + Automata.makeString("bar"))); + assertTrue(Operations.sameLanguage(a1, a2)); + } + + public void testCommonPrefix() throws Exception { + Automaton a = Operations.concatenate( + Automata.makeString("foobar"), + Automata.makeAnyString()); + assertEquals("foobar", Operations.getCommonPrefix(a)); + } + + public void testConcatenate1() throws Exception { + Automaton a = Operations.concatenate( + Automata.makeString("m"), + Automata.makeAnyString()); + assertTrue(Operations.run(a, "m")); + assertTrue(Operations.run(a, "me")); + assertTrue(Operations.run(a, "me too")); + } + + public void testConcatenate2() throws Exception { + Automaton a = Operations.concatenate(Arrays.asList( + Automata.makeString("m"), + Automata.makeAnyString(), + Automata.makeString("n"), + Automata.makeAnyString())); + a = Operations.determinize(a); + assertTrue(Operations.run(a, "mn")); + assertTrue(Operations.run(a, "mone")); + assertFalse(Operations.run(a, "m")); + assertFalse(Operations.isFinite(a)); + } + + public void testUnion1() throws Exception { + Automaton a = Operations.union(Arrays.asList( + Automata.makeString("foobar"), + Automata.makeString("barbaz"))); + a = Operations.determinize(a); + assertTrue(Operations.run(a, "foobar")); + assertTrue(Operations.run(a, "barbaz")); + + assertMatches(a, "foobar", "barbaz"); + } + + public void testUnion2() throws Exception { + Automaton a = Operations.union(Arrays.asList( + Automata.makeString("foobar"), + Automata.makeString(""), + Automata.makeString("barbaz"))); + a = Operations.determinize(a); + assertTrue(Operations.run(a, "foobar")); + assertTrue(Operations.run(a, "barbaz")); + assertTrue(Operations.run(a, "")); + + assertMatches(a, "", "foobar", "barbaz"); + } + + public void testMinimizeSimple() throws Exception { + Automaton a = Automata.makeString("foobar"); + Automaton aMin = MinimizationOperations.minimize(a); + + assertTrue(Operations.sameLanguage(a, aMin)); + } + + public void testMinimize2() throws Exception { + Automaton a = Operations.union(Arrays.asList(Automata.makeString("foobar"), + Automata.makeString("boobar"))); + Automaton aMin = MinimizationOperations.minimize(a); + assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(a)), aMin)); + } + + public void testReverse() throws Exception { + Automaton a = Automata.makeString("foobar"); + Automaton ra = Operations.reverse(a); + Automaton a2 = Operations.determinize(Operations.reverse(ra)); + + assertTrue(Operations.sameLanguage(a, a2)); + } + + public void testOptional() throws Exception { + Automaton a = Automata.makeString("foobar"); + Automaton a2 = Operations.optional(a); + a2 = Operations.determinize(a2); + + assertTrue(Operations.run(a, "foobar")); + assertFalse(Operations.run(a, "")); + assertTrue(Operations.run(a2, "foobar")); + assertTrue(Operations.run(a2, "")); + } + + public void testRepeatAny() throws Exception { + Automaton a = Automata.makeString("zee"); + Automaton a2 = Operations.determinize(Operations.repeat(a)); + assertTrue(Operations.run(a2, "")); + assertTrue(Operations.run(a2, "zee")); + assertTrue(Operations.run(a2, "zeezee")); + assertTrue(Operations.run(a2, "zeezeezee")); + } + + public void testRepeatMin() throws Exception { + Automaton a = Automata.makeString("zee"); + Automaton a2 = Operations.determinize(Operations.repeat(a, 2)); + assertFalse(Operations.run(a2, "")); + assertFalse(Operations.run(a2, "zee")); + assertTrue(Operations.run(a2, "zeezee")); + assertTrue(Operations.run(a2, "zeezeezee")); + } + + public void testRepeatMinMax1() throws Exception { + Automaton a = Automata.makeString("zee"); + Automaton a2 = Operations.determinize(Operations.repeat(a, 0, 2)); + assertTrue(Operations.run(a2, "")); + assertTrue(Operations.run(a2, "zee")); + assertTrue(Operations.run(a2, "zeezee")); + assertFalse(Operations.run(a2, "zeezeezee")); + } + + public void testRepeatMinMax2() throws Exception { + Automaton a = Automata.makeString("zee"); + Automaton a2 = Operations.determinize(Operations.repeat(a, 2, 4)); + assertFalse(Operations.run(a2, "")); + assertFalse(Operations.run(a2, "zee")); + assertTrue(Operations.run(a2, "zeezee")); + assertTrue(Operations.run(a2, "zeezeezee")); + assertTrue(Operations.run(a2, "zeezeezeezee")); + assertFalse(Operations.run(a2, "zeezeezeezeezee")); + } + + public void testComplement() throws Exception { + Automaton a = Automata.makeString("zee"); + Automaton a2 = Operations.determinize(Operations.complement(a)); + assertTrue(Operations.run(a2, "")); + assertFalse(Operations.run(a2, "zee")); + assertTrue(Operations.run(a2, "zeezee")); + assertTrue(Operations.run(a2, "zeezeezee")); + } + + public void testInterval() throws Exception { + Automaton a = Operations.determinize(Automata.makeInterval(17, 100, 3)); + assertFalse(Operations.run(a, "")); + assertTrue(Operations.run(a, "017")); + assertTrue(Operations.run(a, "100")); + assertTrue(Operations.run(a, "073")); + } + + public void testCommonSuffix() throws Exception { + Automaton a = new Automaton(); + int init = a.createState(); + int fini = a.createState(); + a.setAccept(init, true); + a.setAccept(fini, true); + a.addTransition(init, fini, 'm'); + a.addTransition(fini, fini, 'm'); + a.finishState(); + assertEquals(0, Operations.getCommonSuffixBytesRef(a).length); + } + + public void testReverseRandom1() throws Exception { + int ITERS = atLeast(100); + for(int i=0;i allTrans = new ArrayList<>(); + int numStates = a.getNumStates(); + for(int s=0;s 0) { + mins = prefix.substring(mins.length()) + mins; + maxs = prefix.substring(maxs.length()) + maxs; + } + assertTrue(Operations.run(a, mins)); + assertTrue(Operations.run(a, maxs)); + + for(int iter2=0;iter2<100;iter2++) { + int x = random().nextInt(2*max); + boolean expected = x >= min && x <= max; + String sx = Integer.toString(x); + if (sx.length() < digits) { + // Left-fill with 0s + sx = b.substring(sx.length()) + sx; + } else if (digits == 0) { + // Left-fill with random number of 0s: + int numZeros = random().nextInt(10); + StringBuilder sb = new StringBuilder(); + for(int i=0;i expected = new HashSet<>(); + for(String s : strings) { + IntsRef ints = new IntsRef(); + expected.add(Util.toUTF32(s, ints)); + } + + assertEquals(expected, Operations.getFiniteStrings(Operations.determinize(a), -1)); + } + + public void testConcatenatePreservesDet() throws Exception { + Automaton a1 = Automata.makeString("foobar"); + assertTrue(a1.isDeterministic()); + Automaton a2 = Automata.makeString("baz"); + assertTrue(a2.isDeterministic()); + assertTrue((Operations.concatenate(Arrays.asList(a1, a2)).isDeterministic())); + } + + public void testRemoveDeadStates() throws Exception { + Automaton a = Operations.concatenate(Arrays.asList(Automata.makeString("x"), + Automata.makeString("y"))); + assertEquals(4, a.getNumStates()); + a = Operations.removeDeadStates(a); + assertEquals(3, a.getNumStates()); + } + + public void testRemoveDeadStatesEmpty1() throws Exception { + Automaton a = new Automaton(); + a.finishState(); + assertTrue(Operations.isEmpty(a)); + assertTrue(Operations.isEmpty(Operations.removeDeadStates(a))); + } + + public void testRemoveDeadStatesEmpty2() throws Exception { + Automaton a = new Automaton(); + a.finishState(); + assertTrue(Operations.isEmpty(a)); + assertTrue(Operations.isEmpty(Operations.removeDeadStates(a))); + } + + public void testRemoveDeadStatesEmpty3() throws Exception { + Automaton a = new Automaton(); + int init = a.createState(); + int fini = a.createState(); + a.addTransition(init, fini, 'a'); + Automaton a2 = Operations.removeDeadStates(a); + assertEquals(0, a2.getNumStates()); + } + + public void testConcatEmpty() throws Exception { + // If you concat empty automaton to anything the result should still be empty: + Automaton a = Operations.concatenate(Automata.makeEmpty(), + Automata.makeString("foo")); + assertEquals(new HashSet(), Operations.getFiniteStrings(a, -1)); + + a = Operations.concatenate(Automata.makeString("foo"), + Automata.makeEmpty()); + assertEquals(new HashSet(), Operations.getFiniteStrings(a, -1)); + } + + public void testSeemsNonEmptyButIsNot1() throws Exception { + Automaton a = new Automaton(); + // Init state has a transition but doesn't lead to accept + int init = a.createState(); + int s = a.createState(); + a.addTransition(init, s, 'a'); + a.finishState(); + assertTrue(Operations.isEmpty(a)); + } + + public void testSeemsNonEmptyButIsNot2() throws Exception { + Automaton a = new Automaton(); + int init = a.createState(); + int s = a.createState(); + a.addTransition(init, s, 'a'); + // An orphan'd accept state + s = a.createState(); + a.setAccept(s, true); + a.finishState(); + assertTrue(Operations.isEmpty(a)); + } + + public void testSameLanguage1() throws Exception { + Automaton a = Automata.makeEmptyString(); + Automaton a2 = Automata.makeEmptyString(); + int state = a2.createState(); + a2.addTransition(0, state, 'a'); + a2.finishState(); + assertTrue(Operations.sameLanguage(Operations.removeDeadStates(a), + Operations.removeDeadStates(a2))); + } + + private Automaton randomNoOp(Automaton a) { + switch (random().nextInt(7)) { + case 0: + if (VERBOSE) { + System.out.println(" randomNoOp: determinize"); + } + return Operations.determinize(a); + case 1: + if (VERBOSE) { + System.out.println(" randomNoOp: minimize"); + } + return MinimizationOperations.minimize(a); + case 2: + if (VERBOSE) { + System.out.println(" randomNoOp: removeDeadStates"); + } + return Operations.removeDeadStates(a); + case 3: + if (VERBOSE) { + System.out.println(" randomNoOp: reverse reverse"); + } + a = Operations.reverse(a); + a = randomNoOp(a); + return Operations.reverse(a); + case 4: + if (VERBOSE) { + System.out.println(" randomNoOp: concat empty string"); + } + return Operations.concatenate(a, Automata.makeEmptyString()); + case 5: + if (VERBOSE) { + System.out.println(" randomNoOp: union empty automaton"); + } + return Operations.union(a, Automata.makeEmpty()); + case 6: + if (VERBOSE) { + System.out.println(" randomNoOp: do nothing!"); + } + return a; + } + assert false; + return null; + } + + private Automaton unionTerms(Collection terms) { + Automaton a; + if (random().nextBoolean()) { + if (VERBOSE) { + System.out.println("TEST: unionTerms: use union"); + } + List as = new ArrayList<>(); + for(BytesRef term : terms) { + as.add(Automata.makeString(term.utf8ToString())); + } + a = Operations.union(as); + } else { + if (VERBOSE) { + System.out.println("TEST: unionTerms: use makeStringUnion"); + } + List termsList = new ArrayList<>(terms); + Collections.sort(termsList); + a = Automata.makeStringUnion(termsList); + } + + return randomNoOp(a); + } + + private String getRandomString() { + //return TestUtil.randomSimpleString(random()); + return TestUtil.randomRealisticUnicodeString(random()); + } + + public void testRandomFinite() throws Exception { + + int numTerms = atLeast(10); + int iters = atLeast(100); + + if (VERBOSE) { + System.out.println("TEST: numTerms" + numTerms + " iters=" + iters); + } + + Set terms = new HashSet<>(); + while (terms.size() < numTerms) { + terms.add(new BytesRef(getRandomString())); + } + + Automaton a = unionTerms(terms); + assertSame(terms, a); + + for(int iter=0;iter newTerms = new HashSet<>(); + BytesRef prefix = new BytesRef(getRandomString()); + for(BytesRef term : terms) { + BytesRef newTerm = BytesRef.deepCopyOf(prefix); + newTerm.append(term); + newTerms.add(newTerm); + } + terms = newTerms; + boolean wasDeterministic1 = a.isDeterministic(); + a = Operations.concatenate(Automata.makeString(prefix.utf8ToString()), a); + assertEquals(wasDeterministic1, a.isDeterministic()); + } + break; + + case 1: + // concatenate suffix + { + BytesRef suffix = new BytesRef(getRandomString()); + if (VERBOSE) { + System.out.println(" op=concat suffix " + suffix); + } + Set newTerms = new HashSet<>(); + for(BytesRef term : terms) { + BytesRef newTerm = BytesRef.deepCopyOf(term); + newTerm.append(suffix); + newTerms.add(newTerm); + } + terms = newTerms; + a = Operations.concatenate(a, Automata.makeString(suffix.utf8ToString())); + } + break; + + case 2: + // determinize + if (VERBOSE) { + System.out.println(" op=determinize"); + } + a = Operations.determinize(a); + assertTrue(a.isDeterministic()); + break; + + case 3: + if (VERBOSE) { + System.out.println(" op=minimize"); + } + // minimize + a = MinimizationOperations.minimize(a); + break; + + case 4: + // union + { + if (VERBOSE) { + System.out.println(" op=union"); + } + Set newTerms = new HashSet<>(); + int numNewTerms = random().nextInt(5); + while (newTerms.size() < numNewTerms) { + newTerms.add(new BytesRef(getRandomString())); + } + terms.addAll(newTerms); + Automaton newA = unionTerms(newTerms); + a = Operations.union(a, newA); + } + break; + + case 5: + // optional + { + if (VERBOSE) { + System.out.println(" op=optional"); + } + a = Operations.optional(a); + terms.add(new BytesRef()); + } + break; + + case 6: + // minus finite + { + if (VERBOSE) { + System.out.println(" op=minus finite"); + } + if (terms.size() > 0) { + RandomAcceptedStrings rasl = new RandomAcceptedStrings(Operations.removeDeadStates(a)); + Set toRemove = new HashSet<>(); + int numToRemove = TestUtil.nextInt(random(), 1, (terms.size()+1)/2); + while (toRemove.size() < numToRemove) { + int[] ints = rasl.getRandomAcceptedString(random()); + BytesRef term = new BytesRef(UnicodeUtil.newString(ints, 0, ints.length)); + if (toRemove.contains(term) == false) { + toRemove.add(term); + } + } + for(BytesRef term : toRemove) { + boolean removed = terms.remove(term); + assertTrue(removed); + } + Automaton a2 = unionTerms(toRemove); + a = Operations.minus(a, a2); + } + } + break; + + case 7: + { + // minus infinite + List as = new ArrayList<>(); + int count = TestUtil.nextInt(random(), 1, 5); + Set prefixes = new HashSet<>(); + while(prefixes.size() < count) { + // prefix is a leading ascii byte; we remove * from a + int prefix = random().nextInt(128); + prefixes.add(prefix); + } + + if (VERBOSE) { + System.out.println(" op=minus infinite prefixes=" + prefixes); + } + + for(int prefix : prefixes) { + // prefix is a leading ascii byte; we remove * from a + Automaton a2 = new Automaton(); + int init = a2.createState(); + int state = a2.createState(); + a2.addTransition(init, state, prefix); + a2.setAccept(state, true); + a2.addTransition(state, state, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT); + a2.finishState(); + as.add(a2); + Iterator it = terms.iterator(); + while (it.hasNext()) { + BytesRef term = it.next(); + if (term.length > 0 && (term.bytes[term.offset] & 0xFF) == prefix) { + it.remove(); + } + } + } + Automaton a2 = randomNoOp(Operations.union(as)); + a = Operations.minus(a, a2); + } + break; + + case 8: + { + int count = TestUtil.nextInt(random(), 10, 20); + if (VERBOSE) { + System.out.println(" op=intersect infinite count=" + count); + } + // intersect infinite + List as = new ArrayList<>(); + + Set prefixes = new HashSet<>(); + while(prefixes.size() < count) { + int prefix = random().nextInt(128); + prefixes.add(prefix); + } + if (VERBOSE) { + System.out.println(" prefixes=" + prefixes); + } + + for(int prefix : prefixes) { + // prefix is a leading ascii byte; we retain * in a + Automaton a2 = new Automaton(); + int init = a2.createState(); + int state = a2.createState(); + a2.addTransition(init, state, prefix); + a2.setAccept(state, true); + a2.addTransition(state, state, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT); + a2.finishState(); + as.add(a2); + prefixes.add(prefix); + } + + Automaton a2 = Operations.union(as); + if (random().nextBoolean()) { + a2 = Operations.determinize(a2); + } else if (random().nextBoolean()) { + a2 = MinimizationOperations.minimize(a2); + } + a = Operations.intersection(a, a2); + + Iterator it = terms.iterator(); + while (it.hasNext()) { + BytesRef term = it.next(); + if (term.length == 0 || prefixes.contains(term.bytes[term.offset]&0xff) == false) { + if (VERBOSE) { + System.out.println(" drop term=" + term); + } + it.remove(); + } else { + if (VERBOSE) { + System.out.println(" keep term=" + term); + } + } + } + } + break; + + case 9: + // reverse + { + if (VERBOSE) { + System.out.println(" op=reverse"); + } + a = Operations.reverse(a); + Set newTerms = new HashSet<>(); + for(BytesRef term : terms) { + newTerms.add(new BytesRef(new StringBuilder(term.utf8ToString()).reverse().toString())); + } + terms = newTerms; + } + break; + + case 10: + if (VERBOSE) { + System.out.println(" op=randomNoOp"); + } + a = randomNoOp(a); + break; + + case 11: + // interval + { + int min = random().nextInt(1000); + int max = min + random().nextInt(50); + // digits must be non-zero else we make cycle + int digits = Integer.toString(max).length(); + if (VERBOSE) { + System.out.println(" op=union interval min=" + min + " max=" + max + " digits=" + digits); + } + a = Operations.union(a, Automata.makeInterval(min, max, digits)); + StringBuilder b = new StringBuilder(); + for(int i=0;i addTerms = new HashSet<>(); + while (addTerms.size() < count) { + addTerms.add(new BytesRef(getRandomString())); + } + if (VERBOSE) { + for(BytesRef term : addTerms) { + System.out.println(" term=" + term); + } + } + Automaton a2 = unionTerms(addTerms); + Set newTerms = new HashSet<>(); + if (random().nextBoolean()) { + // suffix + if (VERBOSE) { + System.out.println(" do suffix"); + } + a = Operations.concatenate(a, randomNoOp(a2)); + for(BytesRef term : terms) { + for(BytesRef suffix : addTerms) { + BytesRef newTerm = BytesRef.deepCopyOf(term); + newTerm.append(suffix); + newTerms.add(newTerm); + } + } + } else { + // prefix + if (VERBOSE) { + System.out.println(" do prefix"); + } + a = Operations.concatenate(randomNoOp(a2), a); + for(BytesRef term : terms) { + for(BytesRef prefix : addTerms) { + BytesRef newTerm = BytesRef.deepCopyOf(prefix); + newTerm.append(term); + newTerms.add(newTerm); + } + } + } + + terms = newTerms; + } + break; + } + + // assertSame(terms, a); + assertEquals(AutomatonTestUtil.isDeterministicSlow(a), a.isDeterministic()); + } + + assertSame(terms, a); + } + + private void assertSame(Collection terms, Automaton a) { + + try { + assertTrue(Operations.isFinite(a)); + assertFalse(Operations.isTotal(a)); + + Automaton detA = Operations.determinize(a); + + // Make sure all terms are accepted: + IntsRef scratch = new IntsRef(); + for(BytesRef term : terms) { + Util.toIntsRef(term, scratch); + assertTrue("failed to accept term=" + term.utf8ToString(), Operations.run(detA, term.utf8ToString())); + } + + // Use getFiniteStrings: + Set expected = new HashSet<>(); + for(BytesRef term : terms) { + IntsRef intsRef = new IntsRef(); + Util.toUTF32(term.utf8ToString(), intsRef); + expected.add(intsRef); + } + Set actual = Operations.getFiniteStrings(a, -1); + + if (expected.equals(actual) == false) { + System.out.println("FAILED:"); + for(IntsRef term : expected) { + if (actual.contains(term) == false) { + System.out.println(" term=" + term + " should be accepted but isn't"); + } + } + for(IntsRef term : actual) { + if (expected.contains(term) == false) { + System.out.println(" term=" + term + " is accepted but should not be"); + } + } + throw new AssertionError("mismatch"); + } + + // Use sameLanguage: + Automaton a2 = Operations.removeDeadStates(Operations.determinize(unionTerms(terms))); + assertTrue(Operations.sameLanguage(a2, Operations.removeDeadStates(Operations.determinize(a)))); + + // Do same check, in UTF8 space + Automaton utf8 = randomNoOp(new UTF32ToUTF8().convert(a)); + + Set expected2 = new HashSet<>(); + for(BytesRef term : terms) { + IntsRef intsRef = new IntsRef(); + Util.toIntsRef(term, intsRef); + expected2.add(intsRef); + } + assertEquals(expected2, Operations.getFiniteStrings(utf8, -1)); + } catch (AssertionError ae) { + System.out.println("TEST: FAILED: not same"); + System.out.println(" terms (count=" + terms.size() + "):"); + for(BytesRef term : terms) { + System.out.println(" " + term); + } + System.out.println(" automaton:"); + System.out.println(a.toDot()); + //a.writeDot("fail"); + throw ae; + } + } +} diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java deleted file mode 100644 index 1b2e62df3c9e..000000000000 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java +++ /dev/null @@ -1,146 +0,0 @@ -package org.apache.lucene.util.automaton; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.*; - -import org.apache.lucene.util.*; - -import com.carrotsearch.randomizedtesting.generators.RandomInts; - -public class TestBasicOperations extends LuceneTestCase { - /** Test string union. */ - public void testStringUnion() { - List strings = new ArrayList<>(); - for (int i = RandomInts.randomIntBetween(random(), 0, 1000); --i >= 0;) { - strings.add(new BytesRef(TestUtil.randomUnicodeString(random()))); - } - - Collections.sort(strings); - Automaton union = BasicAutomata.makeStringUnion(strings); - assertTrue(union.isDeterministic()); - assertTrue(BasicOperations.sameLanguage(union, naiveUnion(strings))); - } - - private static Automaton naiveUnion(List strings) { - Automaton [] eachIndividual = new Automaton [strings.size()]; - int i = 0; - for (BytesRef bref : strings) { - eachIndividual[i++] = BasicAutomata.makeString(bref.utf8ToString()); - } - return BasicOperations.union(Arrays.asList(eachIndividual)); - } - - /** Test optimization to concatenate() */ - public void testSingletonConcatenate() { - Automaton singleton = BasicAutomata.makeString("prefix"); - Automaton expandedSingleton = singleton.cloneExpanded(); - Automaton other = BasicAutomata.makeCharRange('5', '7'); - Automaton concat = BasicOperations.concatenate(singleton, other); - assertTrue(concat.isDeterministic()); - assertTrue(BasicOperations.sameLanguage(BasicOperations.concatenate(expandedSingleton, other), concat)); - } - - /** Test optimization to concatenate() to an NFA */ - public void testSingletonNFAConcatenate() { - Automaton singleton = BasicAutomata.makeString("prefix"); - Automaton expandedSingleton = singleton.cloneExpanded(); - // an NFA (two transitions for 't' from initial state) - Automaton nfa = BasicOperations.union(BasicAutomata.makeString("this"), - BasicAutomata.makeString("three")); - Automaton concat = BasicOperations.concatenate(singleton, nfa); - assertFalse(concat.isDeterministic()); - assertTrue(BasicOperations.sameLanguage(BasicOperations.concatenate(expandedSingleton, nfa), concat)); - } - - /** Test optimization to concatenate() with empty String */ - public void testEmptySingletonConcatenate() { - Automaton singleton = BasicAutomata.makeString(""); - Automaton expandedSingleton = singleton.cloneExpanded(); - Automaton other = BasicAutomata.makeCharRange('5', '7'); - Automaton concat1 = BasicOperations.concatenate(expandedSingleton, other); - Automaton concat2 = BasicOperations.concatenate(singleton, other); - assertTrue(concat2.isDeterministic()); - assertTrue(BasicOperations.sameLanguage(concat1, concat2)); - assertTrue(BasicOperations.sameLanguage(other, concat1)); - assertTrue(BasicOperations.sameLanguage(other, concat2)); - } - - /** Test concatenation with empty language returns empty */ - public void testEmptyLanguageConcatenate() { - Automaton a = BasicAutomata.makeString("a"); - Automaton concat = BasicOperations.concatenate(a, BasicAutomata.makeEmpty()); - assertTrue(BasicOperations.isEmpty(concat)); - } - - /** Test optimization to concatenate() with empty String to an NFA */ - public void testEmptySingletonNFAConcatenate() { - Automaton singleton = BasicAutomata.makeString(""); - Automaton expandedSingleton = singleton.cloneExpanded(); - // an NFA (two transitions for 't' from initial state) - Automaton nfa = BasicOperations.union(BasicAutomata.makeString("this"), - BasicAutomata.makeString("three")); - Automaton concat1 = BasicOperations.concatenate(expandedSingleton, nfa); - Automaton concat2 = BasicOperations.concatenate(singleton, nfa); - assertFalse(concat2.isDeterministic()); - assertTrue(BasicOperations.sameLanguage(concat1, concat2)); - assertTrue(BasicOperations.sameLanguage(nfa, concat1)); - assertTrue(BasicOperations.sameLanguage(nfa, concat2)); - } - - /** Test singletons work correctly */ - public void testSingleton() { - Automaton singleton = BasicAutomata.makeString("foobar"); - Automaton expandedSingleton = singleton.cloneExpanded(); - assertTrue(BasicOperations.sameLanguage(singleton, expandedSingleton)); - - singleton = BasicAutomata.makeString("\ud801\udc1c"); - expandedSingleton = singleton.cloneExpanded(); - assertTrue(BasicOperations.sameLanguage(singleton, expandedSingleton)); - } - - public void testGetRandomAcceptedString() throws Throwable { - final int ITER1 = atLeast(100); - final int ITER2 = atLeast(100); - for(int i=0;i 0) { - assertTrue(automata[n-1].subsetOf(automata[n])); - assertTrue(automata[n-1].subsetOf(tautomata[n])); - assertTrue(tautomata[n-1].subsetOf(automata[n])); - assertTrue(tautomata[n-1].subsetOf(tautomata[n])); + assertTrue(Operations.subsetOf(Operations.removeDeadStates(automata[n-1]), + Operations.removeDeadStates(automata[n]))); + assertTrue(Operations.subsetOf(Operations.removeDeadStates(automata[n-1]), + Operations.removeDeadStates(tautomata[n]))); + assertTrue(Operations.subsetOf(Operations.removeDeadStates(tautomata[n-1]), + Operations.removeDeadStates(automata[n]))); + assertTrue(Operations.subsetOf(Operations.removeDeadStates(tautomata[n-1]), + Operations.removeDeadStates(tautomata[n]))); assertNotSame(automata[n-1], automata[n]); } // check that Lev(N) is a subset of LevT(N) - assertTrue(automata[n].subsetOf(tautomata[n])); + assertTrue(Operations.subsetOf(Operations.removeDeadStates(automata[n]), + Operations.removeDeadStates(tautomata[n]))); // special checks for specific n switch(n) { case 0: // easy, matches the string itself - assertTrue(BasicOperations.sameLanguage(BasicAutomata.makeString(s), automata[0])); - assertTrue(BasicOperations.sameLanguage(BasicAutomata.makeString(s), tautomata[0])); + assertTrue(Operations.sameLanguage(Automata.makeString(s), Operations.removeDeadStates(automata[0]))); + assertTrue(Operations.sameLanguage(Automata.makeString(s), Operations.removeDeadStates(tautomata[0]))); break; case 1: // generate a lev1 naively, and check the accepted lang is the same. - assertTrue(BasicOperations.sameLanguage(naiveLev1(s), automata[1])); - assertTrue(BasicOperations.sameLanguage(naiveLev1T(s), tautomata[1])); + assertTrue(Operations.sameLanguage(naiveLev1(s), Operations.removeDeadStates(automata[1]))); + assertTrue(Operations.sameLanguage(naiveLev1T(s), Operations.removeDeadStates(tautomata[1]))); break; default: assertBruteForce(s, automata[n], n); @@ -114,13 +119,13 @@ private void assertLev(String s, int maxDistance) { * substitutions of s. */ private Automaton naiveLev1(String s) { - Automaton a = BasicAutomata.makeString(s); - a = BasicOperations.union(a, insertionsOf(s)); - MinimizationOperations.minimize(a); - a = BasicOperations.union(a, deletionsOf(s)); - MinimizationOperations.minimize(a); - a = BasicOperations.union(a, substitutionsOf(s)); - MinimizationOperations.minimize(a); + Automaton a = Automata.makeString(s); + a = Operations.union(a, insertionsOf(s)); + a = MinimizationOperations.minimize(a); + a = Operations.union(a, deletionsOf(s)); + a = MinimizationOperations.minimize(a); + a = Operations.union(a, substitutionsOf(s)); + a = MinimizationOperations.minimize(a); return a; } @@ -131,8 +136,8 @@ private Automaton naiveLev1(String s) { */ private Automaton naiveLev1T(String s) { Automaton a = naiveLev1(s); - a = BasicOperations.union(a, transpositionsOf(s)); - MinimizationOperations.minimize(a); + a = Operations.union(a, transpositionsOf(s)); + a = MinimizationOperations.minimize(a); return a; } @@ -144,15 +149,14 @@ private Automaton insertionsOf(String s) { List list = new ArrayList<>(); for (int i = 0; i <= s.length(); i++) { - Automaton a = BasicAutomata.makeString(s.substring(0, i)); - a = BasicOperations.concatenate(a, BasicAutomata.makeAnyChar()); - a = BasicOperations.concatenate(a, BasicAutomata.makeString(s - .substring(i))); + Automaton a = Automata.makeString(s.substring(0, i)); + a = Operations.concatenate(a, Automata.makeAnyChar()); + a = Operations.concatenate(a, Automata.makeString(s.substring(i))); list.add(a); } - Automaton a = BasicOperations.union(list); - MinimizationOperations.minimize(a); + Automaton a = Operations.union(list); + a = MinimizationOperations.minimize(a); return a; } @@ -164,15 +168,13 @@ private Automaton deletionsOf(String s) { List list = new ArrayList<>(); for (int i = 0; i < s.length(); i++) { - Automaton a = BasicAutomata.makeString(s.substring(0, i)); - a = BasicOperations.concatenate(a, BasicAutomata.makeString(s - .substring(i + 1))); - a.expandSingleton(); + Automaton a = Automata.makeString(s.substring(0, i)); + a = Operations.concatenate(a, Automata.makeString(s.substring(i + 1))); list.add(a); } - Automaton a = BasicOperations.union(list); - MinimizationOperations.minimize(a); + Automaton a = Operations.union(list); + a = MinimizationOperations.minimize(a); return a; } @@ -184,15 +186,14 @@ private Automaton substitutionsOf(String s) { List list = new ArrayList<>(); for (int i = 0; i < s.length(); i++) { - Automaton a = BasicAutomata.makeString(s.substring(0, i)); - a = BasicOperations.concatenate(a, BasicAutomata.makeAnyChar()); - a = BasicOperations.concatenate(a, BasicAutomata.makeString(s - .substring(i + 1))); + Automaton a = Automata.makeString(s.substring(0, i)); + a = Operations.concatenate(a, Automata.makeAnyChar()); + a = Operations.concatenate(a, Automata.makeString(s.substring(i + 1))); list.add(a); } - Automaton a = BasicOperations.union(list); - MinimizationOperations.minimize(a); + Automaton a = Operations.union(list); + a = MinimizationOperations.minimize(a); return a; } @@ -201,8 +202,9 @@ private Automaton substitutionsOf(String s) { * (transposing two adjacent characters) */ private Automaton transpositionsOf(String s) { - if (s.length() < 2) - return BasicAutomata.makeEmpty(); + if (s.length() < 2) { + return Automata.makeEmpty(); + } List list = new ArrayList<>(); for (int i = 0; i < s.length()-1; i++) { StringBuilder sb = new StringBuilder(); @@ -211,11 +213,12 @@ private Automaton transpositionsOf(String s) { sb.append(s.charAt(i)); sb.append(s.substring(i+2, s.length())); String st = sb.toString(); - if (!st.equals(s)) - list.add(BasicAutomata.makeString(st)); + if (!st.equals(s)) { + list.add(Automata.makeString(st)); + } } - Automaton a = BasicOperations.union(list); - MinimizationOperations.minimize(a); + Automaton a = Operations.union(list); + a = MinimizationOperations.minimize(a); return a; } diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestMinimize.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestMinimize.java index e306253b0adf..82a2914f43bb 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestMinimize.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestMinimize.java @@ -24,13 +24,13 @@ */ public class TestMinimize extends LuceneTestCase { /** the minimal and non-minimal are compared to ensure they are the same. */ - public void test() { + public void testBasic() { int num = atLeast(200); for (int i = 0; i < num; i++) { Automaton a = AutomatonTestUtil.randomAutomaton(random()); - Automaton b = a.clone(); - MinimizationOperations.minimize(b); - assertTrue(BasicOperations.sameLanguage(a, b)); + Automaton la = Operations.determinize(Operations.removeDeadStates(a)); + Automaton lb = MinimizationOperations.minimize(a); + assertTrue(Operations.sameLanguage(la, lb)); } } @@ -41,12 +41,22 @@ public void testAgainstBrzozowski() { int num = atLeast(200); for (int i = 0; i < num; i++) { Automaton a = AutomatonTestUtil.randomAutomaton(random()); - AutomatonTestUtil.minimizeSimple(a); - Automaton b = a.clone(); - MinimizationOperations.minimize(b); - assertTrue(BasicOperations.sameLanguage(a, b)); - assertEquals(a.getNumberOfStates(), b.getNumberOfStates()); - assertEquals(a.getNumberOfTransitions(), b.getNumberOfTransitions()); + a = AutomatonTestUtil.minimizeSimple(a); + Automaton b = MinimizationOperations.minimize(a); + assertTrue(Operations.sameLanguage(a, b)); + assertEquals(a.getNumStates(), b.getNumStates()); + int numStates = a.getNumStates(); + + int sum1 = 0; + for(int s=0;s strings = new ArrayList<>(); + for (int i = RandomInts.randomIntBetween(random(), 0, 1000); --i >= 0;) { + strings.add(new BytesRef(TestUtil.randomUnicodeString(random()))); + } + + Collections.sort(strings); + Automaton union = Automata.makeStringUnion(strings); + assertTrue(union.isDeterministic()); + assertTrue(Operations.sameLanguage(union, naiveUnion(strings))); + } + + private static Automaton naiveUnion(List strings) { + Automaton[] eachIndividual = new Automaton[strings.size()]; + int i = 0; + for (BytesRef bref : strings) { + eachIndividual[i++] = Automata.makeString(bref.utf8ToString()); + } + return Operations.determinize(Operations.union(Arrays.asList(eachIndividual))); + } + + /** Test concatenation with empty language returns empty */ + public void testEmptyLanguageConcatenate() { + Automaton a = Automata.makeString("a"); + Automaton concat = Operations.concatenate(a, Automata.makeEmpty()); + assertTrue(Operations.isEmpty(concat)); + } + + /** Test optimization to concatenate() with empty String to an NFA */ + public void testEmptySingletonNFAConcatenate() { + Automaton singleton = Automata.makeString(""); + Automaton expandedSingleton = singleton; + // an NFA (two transitions for 't' from initial state) + Automaton nfa = Operations.union(Automata.makeString("this"), + Automata.makeString("three")); + Automaton concat1 = Operations.concatenate(expandedSingleton, nfa); + Automaton concat2 = Operations.concatenate(singleton, nfa); + assertFalse(concat2.isDeterministic()); + assertTrue(Operations.sameLanguage(Operations.determinize(concat1), + Operations.determinize(concat2))); + assertTrue(Operations.sameLanguage(Operations.determinize(nfa), + Operations.determinize(concat1))); + assertTrue(Operations.sameLanguage(Operations.determinize(nfa), + Operations.determinize(concat2))); + } + + public void testGetRandomAcceptedString() throws Throwable { + final int ITER1 = atLeast(100); + final int ITER2 = atLeast(100); + for(int i=0;i getFiniteStrings(Automaton a, int limit, boolean testRecursive) { - Set result = SpecialOperations.getFiniteStrings(a, limit); + Set result = Operations.getFiniteStrings(a, limit); if (testRecursive) { assertEquals(AutomatonTestUtil.getFiniteStringsRecursive(a, limit), result); } @@ -56,8 +128,8 @@ private Set getFiniteStrings(Automaton a, int limit, boolean testRecurs * Basic test for getFiniteStrings */ public void testFiniteStringsBasic() { - Automaton a = BasicOperations.union(BasicAutomata.makeString("dog"), BasicAutomata.makeString("duck")); - MinimizationOperations.minimize(a); + Automaton a = Operations.union(Automata.makeString("dog"), Automata.makeString("duck")); + a = MinimizationOperations.minimize(a); Set strings = getFiniteStrings(a, -1, true); assertEquals(2, strings.size()); IntsRef dog = new IntsRef(); @@ -74,7 +146,7 @@ public void testFiniteStringsEatsStack() { String bigString1 = new String(chars); TestUtil.randomFixedLengthUnicodeString(random(), chars, 0, chars.length); String bigString2 = new String(chars); - Automaton a = BasicOperations.union(BasicAutomata.makeString(bigString1), BasicAutomata.makeString(bigString2)); + Automaton a = Operations.union(Automata.makeString(bigString1), Automata.makeString(bigString2)); Set strings = getFiniteStrings(a, -1, false); assertEquals(2, strings.size()); IntsRef scratch = new IntsRef(); @@ -92,10 +164,10 @@ public void testRandomFiniteStrings1() { } Set strings = new HashSet(); - List automata = new ArrayList(); + List automata = new ArrayList<>(); for(int i=0;i actual = getFiniteStrings(a, -1, true); @@ -158,7 +225,7 @@ private static String toString(IntsRef ints) { public void testWithCycle() throws Exception { try { - SpecialOperations.getFiniteStrings(new RegExp("abc.*", RegExp.NONE).toAutomaton(), -1); + Operations.getFiniteStrings(new RegExp("abc.*", RegExp.NONE).toAutomaton(), -1); fail("did not hit exception"); } catch (IllegalArgumentException iae) { // expected @@ -174,12 +241,12 @@ public void testRandomFiniteStrings2() { try { // Must pass a limit because the random automaton // can accept MANY strings: - SpecialOperations.getFiniteStrings(a, TestUtil.nextInt(random(), 1, 1000)); + Operations.getFiniteStrings(a, TestUtil.nextInt(random(), 1, 1000)); // NOTE: cannot do this, because the method is not // guaranteed to detect cycles when you have a limit - //assertTrue(SpecialOperations.isFinite(a)); + //assertTrue(Operations.isFinite(a)); } catch (IllegalArgumentException iae) { - assertFalse(SpecialOperations.isFinite(a)); + assertFalse(Operations.isFinite(a)); } } } @@ -187,7 +254,7 @@ public void testRandomFiniteStrings2() { public void testInvalidLimit() { Automaton a = AutomatonTestUtil.randomAutomaton(random()); try { - SpecialOperations.getFiniteStrings(a, -7); + Operations.getFiniteStrings(a, -7); fail("did not hit exception"); } catch (IllegalArgumentException iae) { // expected @@ -197,7 +264,7 @@ public void testInvalidLimit() { public void testInvalidLimit2() { Automaton a = AutomatonTestUtil.randomAutomaton(random()); try { - SpecialOperations.getFiniteStrings(a, 0); + Operations.getFiniteStrings(a, 0); fail("did not hit exception"); } catch (IllegalArgumentException iae) { // expected @@ -205,7 +272,7 @@ public void testInvalidLimit2() { } public void testSingletonNoLimit() { - Set result = SpecialOperations.getFiniteStrings(BasicAutomata.makeString("foobar"), -1); + Set result = Operations.getFiniteStrings(Automata.makeString("foobar"), -1); assertEquals(1, result.size()); IntsRef scratch = new IntsRef(); Util.toUTF32("foobar".toCharArray(), 0, 6, scratch); @@ -213,7 +280,7 @@ public void testSingletonNoLimit() { } public void testSingletonLimit1() { - Set result = SpecialOperations.getFiniteStrings(BasicAutomata.makeString("foobar"), 1); + Set result = Operations.getFiniteStrings(Automata.makeString("foobar"), 1); assertEquals(1, result.size()); IntsRef scratch = new IntsRef(); Util.toUTF32("foobar".toCharArray(), 0, 6, scratch); diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java index d5faa4dfef90..95b19e049566 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java @@ -17,13 +17,17 @@ * limitations under the License. */ +import java.nio.charset.StandardCharsets; +import java.util.HashSet; +import java.util.Random; +import java.util.Set; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; -import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.UnicodeUtil; - -import java.nio.charset.StandardCharsets; -import java.util.Random; +import org.apache.lucene.util.fst.Util; public class TestUTF32ToUTF8 extends LuceneTestCase { @@ -151,12 +155,7 @@ public void testRandomRanges() throws Exception { continue; } - final Automaton a = new Automaton(); - final State end = new State(); - end.setAccept(true); - a.getInitialState().addTransition(new Transition(startCode, endCode, end)); - a.setDeterministic(true); - + Automaton a = Automata.makeCharRange(startCode, endCode); testOne(r, new ByteRunAutomaton(a), startCode, endCode, ITERS_PER_DFA); } } @@ -208,6 +207,20 @@ public void testRandomRegexes() throws Exception { assertAutomaton(new RegExp(AutomatonTestUtil.randomRegexp(random()), RegExp.NONE).toAutomaton()); } } + + public void testSingleton() throws Exception { + int iters = atLeast(100); + for(int iter=0;iter set = new HashSet<>(); + set.add(ints); + assertEquals(set, Operations.getFiniteStrings(utf8, -1)); + } + } private void assertAutomaton(Automaton automaton) throws Exception { CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton); diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java index 26aaeb07d9b2..8c3f60cf74d4 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java @@ -17,6 +17,30 @@ * limitations under the License. */ +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.StringWriter; +import java.io.Writer; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Random; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.concurrent.atomic.AtomicInteger; + import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -40,12 +64,12 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.LineFileDocs; -import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase.Slow; import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; +import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; -import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.RegExp; import org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput; import org.apache.lucene.util.fst.FST.Arc; @@ -54,30 +78,6 @@ import org.apache.lucene.util.fst.Util.Result; import org.apache.lucene.util.packed.PackedInts; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.OutputStreamWriter; -import java.io.StringWriter; -import java.io.Writer; -import java.nio.charset.StandardCharsets; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashSet; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Random; -import java.util.Set; -import java.util.TreeMap; -import java.util.TreeSet; -import java.util.concurrent.atomic.AtomicInteger; - import static org.apache.lucene.util.fst.FSTTester.getRandomString; import static org.apache.lucene.util.fst.FSTTester.simpleRandomString; import static org.apache.lucene.util.fst.FSTTester.toIntsRef; @@ -346,7 +346,7 @@ public void testRealTerms() throws Exception { BytesRef term; int ord = 0; - Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton(); + Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton(); final TermsEnum termsEnum2 = terms.intersect(new CompiledAutomaton(automaton, false, false), null); while((term = termsEnum.next()) != null) { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java index a06102578b10..bf2f1d2e85d1 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java @@ -46,11 +46,11 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.UnicodeUtil; -import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.BasicAutomata; -import org.apache.lucene.util.automaton.BasicOperations; +import org.apache.lucene.util.automaton.Automata; +import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.apache.lucene.util.automaton.LevenshteinAutomata; +import org.apache.lucene.util.automaton.Automaton; /** * Support for highlighting multiterm queries in PostingsHighlighter. @@ -106,8 +106,8 @@ public String toString() { final PrefixQuery pq = (PrefixQuery) query; Term prefix = pq.getPrefix(); if (prefix.field().equals(field)) { - list.add(new CharacterRunAutomaton(BasicOperations.concatenate(BasicAutomata.makeString(prefix.text()), - BasicAutomata.makeAnyString())) { + list.add(new CharacterRunAutomaton(Operations.concatenate(Automata.makeString(prefix.text()), + Automata.makeAnyString())) { @Override public String toString() { return pq.toString(); @@ -126,11 +126,8 @@ public String toString() { int prefixLength = Math.min(fq.getPrefixLength(), termLength); String suffix = UnicodeUtil.newString(termText, prefixLength, termText.length - prefixLength); LevenshteinAutomata builder = new LevenshteinAutomata(suffix, fq.getTranspositions()); - Automaton automaton = builder.toAutomaton(fq.getMaxEdits()); - if (prefixLength > 0) { - Automaton prefix = BasicAutomata.makeString(UnicodeUtil.newString(termText, 0, prefixLength)); - automaton = BasicOperations.concatenate(prefix, automaton); - } + String prefix = UnicodeUtil.newString(termText, 0, prefixLength); + Automaton automaton = builder.toAutomaton(fq.getMaxEdits(), prefix); list.add(new CharacterRunAutomaton(automaton) { @Override public String toString() { @@ -161,7 +158,7 @@ public String toString() { final Comparator comparator = CharsRef.getUTF16SortedAsUTF8Comparator(); // this is *not* an automaton, but its very simple - list.add(new CharacterRunAutomaton(BasicAutomata.makeEmpty()) { + list.add(new CharacterRunAutomaton(Automata.makeEmpty()) { @Override public boolean run(char[] s, int offset, int length) { scratch.chars = s; diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java index 30b86f6267de..ee6d8b88b7ec 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java @@ -55,7 +55,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.apache.lucene.util.automaton.RegExp; import org.w3c.dom.Element; @@ -1340,7 +1340,7 @@ public void testMaxSizeHighlightTruncates() throws Exception { @Override public void run() throws Exception { String goodWord = "goodtoken"; - CharacterRunAutomaton stopWords = new CharacterRunAutomaton(BasicAutomata.makeString("stoppedtoken")); + CharacterRunAutomaton stopWords = new CharacterRunAutomaton(Automata.makeString("stoppedtoken")); // we disable MockTokenizer checks because we will forcefully limit the // tokenstream and call end() before incrementToken() returns false. final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords); diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java index 48c33fe42341..acf07b6b5644 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java @@ -629,7 +629,7 @@ public void testRandom() throws Exception { } DocsEnum parents = MultiFields.getTermDocsEnum(joinR, null, "isParent", new BytesRef("x")); System.out.println("parent docIDs:"); - while (parents.nextDoc() != parents.NO_MORE_DOCS) { + while (parents.nextDoc() != DocsEnum.NO_MORE_DOCS) { System.out.println(" " + parents.docID()); } } diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/precedence/TestPrecedenceQueryParser.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/precedence/TestPrecedenceQueryParser.java index a758a6ff2a8b..389bd0fc1f7f 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/precedence/TestPrecedenceQueryParser.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/precedence/TestPrecedenceQueryParser.java @@ -46,7 +46,7 @@ import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.junit.AfterClass; import org.junit.BeforeClass; @@ -557,7 +557,7 @@ public void testSimpleDAO() throws Exception { } public void testBoost() throws Exception { - CharacterRunAutomaton stopSet = new CharacterRunAutomaton(BasicAutomata.makeString("on")); + CharacterRunAutomaton stopSet = new CharacterRunAutomaton(Automata.makeString("on")); Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet); PrecedenceQueryParser qp = new PrecedenceQueryParser(); diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java index abcf7a772741..5068210f33ae 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java @@ -67,7 +67,7 @@ import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.apache.lucene.util.automaton.RegExp; import org.junit.AfterClass; @@ -957,7 +957,7 @@ public void testSimpleDAO() throws Exception { } public void testBoost() throws Exception { - CharacterRunAutomaton stopSet = new CharacterRunAutomaton(BasicAutomata.makeString("on")); + CharacterRunAutomaton stopSet = new CharacterRunAutomaton(Automata.makeString("on")); Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet); StandardQueryParser qp = new StandardQueryParser(); qp.setAnalyzer(oneStopAnalyzer); diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java index 613a63d804ef..f76c9e1a126c 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java @@ -47,7 +47,7 @@ import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.apache.lucene.util.automaton.RegExp; import org.junit.AfterClass; @@ -868,7 +868,7 @@ public void testSimpleDAO() public void testBoost() throws Exception { - CharacterRunAutomaton stopWords = new CharacterRunAutomaton(BasicAutomata.makeString("on")); + CharacterRunAutomaton stopWords = new CharacterRunAutomaton(Automata.makeString("on")); Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords); CommonQueryParserConfiguration qp = getParserConfig(oneStopAnalyzer); Query q = getQuery("on^1.0",qp); diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java index b4e69e17fde3..3919b4540328 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java @@ -18,45 +18,25 @@ */ import java.io.IOException; -import java.io.PrintStream; import java.util.Collections; import java.util.Iterator; import java.util.TreeMap; -import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.index.CorruptIndexException; -import org.apache.lucene.index.DocsAndPositionsEnum; -import org.apache.lucene.index.DocsEnum; -import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentInfo; -import org.apache.lucene.index.TermState; import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.StringHelper; -import org.apache.lucene.util.automaton.CompiledAutomaton; -import org.apache.lucene.util.automaton.RunAutomaton; -import org.apache.lucene.util.automaton.Transition; -import org.apache.lucene.util.fst.ByteSequenceOutputs; -import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.Outputs; import org.apache.lucene.util.fst.PairOutputs.Pair; -import org.apache.lucene.util.fst.PairOutputs; -import org.apache.lucene.util.fst.Util; /** * See {@link VersionBlockTreeTermsWriter}. diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java index 483cbb8d8793..afae341b6e16 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java @@ -17,6 +17,16 @@ * limitations under the License. */ +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; + import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStreamToAutomaton; @@ -33,30 +43,19 @@ import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.OfflineSorter; import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.BasicOperations; -import org.apache.lucene.util.automaton.SpecialOperations; -import org.apache.lucene.util.automaton.State; import org.apache.lucene.util.automaton.Transition; import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.ByteSequenceOutputs; -import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST.BytesReader; -import org.apache.lucene.util.fst.PairOutputs; +import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.PairOutputs.Pair; +import org.apache.lucene.util.fst.PairOutputs; import org.apache.lucene.util.fst.PositiveIntOutputs; -import org.apache.lucene.util.fst.Util; import org.apache.lucene.util.fst.Util.Result; import org.apache.lucene.util.fst.Util.TopResults; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashSet; -import java.util.List; -import java.util.Set; +import org.apache.lucene.util.fst.Util; /** * Suggester that first analyzes the surface form, adds the @@ -255,37 +254,64 @@ public long ramBytesUsed() { return fst == null ? 0 : fst.ramBytesUsed(); } - private void copyDestTransitions(State from, State to, List transitions) { - if (to.isAccept()) { - from.setAccept(true); - } - for(Transition t : to.getTransitions()) { - transitions.add(t); + private int[] topoSortStates(Automaton a) { + int[] states = new int[a.getNumStates()]; + final Set visited = new HashSet<>(); + final LinkedList worklist = new LinkedList<>(); + worklist.add(0); + visited.add(0); + int upto = 0; + states[upto] = 0; + upto++; + Transition t = new Transition(); + while (worklist.size() > 0) { + int s = worklist.removeFirst(); + int count = a.initTransition(s, t); + for (int i=0;i=0;stateNumber--) { - final State state = states[stateNumber]; - List newTransitions = new ArrayList<>(); - for(Transition t : state.getTransitions()) { - assert t.getMin() == t.getMax(); - if (t.getMin() == TokenStreamToAutomaton.POS_SEP) { + Transition t = new Transition(); + int[] topoSortStates = topoSortStates(a); + for(int i=0;i lookup(final CharSequence key, Set contexts, } final BytesRef utf8Key = new BytesRef(key); try { - Automaton lookupAutomaton = toLookupAutomaton(key); final CharsRef spare = new CharsRef(); @@ -835,7 +862,7 @@ final Set toFiniteStrings(final BytesRef surfaceForm, final TokenStream automaton = ts2a.toAutomaton(ts); } - replaceSep(automaton); + automaton = replaceSep(automaton); automaton = convertAutomaton(automaton); // TODO: LUCENE-5660 re-enable this once we disallow massive suggestion strings @@ -848,7 +875,8 @@ final Set toFiniteStrings(final BytesRef surfaceForm, final TokenStream // TODO: we could walk & add simultaneously, so we // don't have to alloc [possibly biggish] // intermediate HashSet in RAM: - return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions); + + return Operations.getFiniteStrings(automaton, maxGraphExpansions); } final Automaton toLookupAutomaton(final CharSequence key) throws IOException { @@ -856,24 +884,16 @@ final Automaton toLookupAutomaton(final CharSequence key) throws IOException { // Turn tokenstream into automaton: Automaton automaton = null; try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) { - automaton = (getTokenStreamToAutomaton()).toAutomaton(ts); + automaton = getTokenStreamToAutomaton().toAutomaton(ts); } - // TODO: we could use the end offset to "guess" - // whether the final token was a partial token; this - // would only be a heuristic ... but maybe an OK one. - // This way we could eg differentiate "net" from "net ", - // which we can't today... - - replaceSep(automaton); + automaton = replaceSep(automaton); // TODO: we can optimize this somewhat by determinizing // while we convert - BasicOperations.determinize(automaton); + automaton = Operations.determinize(automaton); return automaton; } - - /** * Returns the weight associated with an input string, diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java index b9e886f42245..ef6ea6034ca7 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java @@ -17,13 +17,12 @@ * limitations under the License. */ +import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.io.IOException; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.State; import org.apache.lucene.util.automaton.Transition; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.Util; @@ -43,7 +42,7 @@ private FSTUtil() { public static final class Path { /** Node in the automaton where path ends: */ - public final State state; + public final int state; /** Node in the FST where path ends: */ public final FST.Arc fstNode; @@ -55,7 +54,7 @@ public static final class Path { public final IntsRef input; /** Sole constructor. */ - public Path(State state, FST.Arc fstNode, T output, IntsRef input) { + public Path(int state, FST.Arc fstNode, T output, IntsRef input) { this.state = state; this.fstNode = fstNode; this.output = output; @@ -72,16 +71,22 @@ public static List> intersectPrefixPaths(Automaton a, FST fst) assert a.isDeterministic(); final List> queue = new ArrayList<>(); final List> endNodes = new ArrayList<>(); - queue.add(new Path<>(a.getInitialState(), fst + if (a.getNumStates() == 0) { + return endNodes; + } + + queue.add(new Path<>(0, fst .getFirstArc(new FST.Arc()), fst.outputs.getNoOutput(), new IntsRef())); final FST.Arc scratchArc = new FST.Arc<>(); final FST.BytesReader fstReader = fst.getBytesReader(); - + + Transition t = new Transition(); + while (queue.size() != 0) { final Path path = queue.remove(queue.size() - 1); - if (path.state.isAccept()) { + if (a.isAccept(path.state)) { endNodes.add(path); // we can stop here if we accept this path, // we accept all further paths too @@ -89,18 +94,20 @@ public static List> intersectPrefixPaths(Automaton a, FST fst) } IntsRef currentInput = path.input; - for (Transition t : path.state.getTransitions()) { - final int min = t.getMin(); - final int max = t.getMax(); + int count = a.initTransition(path.state, t); + for (int i=0;i nextArc = fst.findTargetArc(t.getMin(), + final FST.Arc nextArc = fst.findTargetArc(t.min, path.fstNode, scratchArc, fstReader); if (nextArc != null) { final IntsRef newInput = new IntsRef(currentInput.length + 1); newInput.copyInts(currentInput); - newInput.ints[currentInput.length] = t.getMin(); + newInput.ints[currentInput.length] = t.min; newInput.length = currentInput.length + 1; - queue.add(new Path<>(t.getDest(), new FST.Arc() + queue.add(new Path<>(t.dest, new FST.Arc() .copyFrom(nextArc), fst.outputs .add(path.output, nextArc.output), newInput)); } @@ -122,7 +129,7 @@ public static List> intersectPrefixPaths(Automaton a, FST fst) newInput.copyInts(currentInput); newInput.ints[currentInput.length] = nextArc.label; newInput.length = currentInput.length + 1; - queue.add(new Path<>(t.getDest(), new FST.Arc() + queue.add(new Path<>(t.dest, new FST.Arc() .copyFrom(nextArc), fst.outputs .add(path.output, nextArc.output), newInput)); final int label = nextArc.label; // used in assert diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java index 458d59d5ba91..c4f91aab9767 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java @@ -28,11 +28,11 @@ import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadocs import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; -import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.BasicAutomata; -import org.apache.lucene.util.automaton.BasicOperations; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.automaton.Automata; +import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.LevenshteinAutomata; -import org.apache.lucene.util.automaton.SpecialOperations; +import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.UTF32ToUTF8; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.PairOutputs.Pair; @@ -205,7 +205,7 @@ protected List>> getFullPrefixPaths(List ref = SpecialOperations.getFiniteStrings(automaton, -1); + final Set ref = Operations.getFiniteStrings(automaton, -1); Automaton subs[] = new Automaton[ref.size()]; int upto = 0; for (IntsRef path : ref) { if (path.length <= nonFuzzyPrefix || path.length < minFuzzyLength) { - subs[upto] = BasicAutomata.makeString(path.ints, path.offset, path.length); + subs[upto] = Automata.makeString(path.ints, path.offset, path.length); upto++; } else { - Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, nonFuzzyPrefix); int ints[] = new int[path.length-nonFuzzyPrefix]; System.arraycopy(path.ints, path.offset+nonFuzzyPrefix, ints, 0, ints.length); // TODO: maybe add alphaMin to LevenshteinAutomata, @@ -237,29 +236,24 @@ Automaton toLevenshteinAutomata(Automaton automaton) { // edited... but then 0 byte is "in general" allowed // on input (but not in UTF8). LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions); - Automaton levAutomaton = lev.toAutomaton(maxEdits); - Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton)); - combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already - subs[upto] = combined; + subs[upto] = lev.toAutomaton(maxEdits, UnicodeUtil.newString(path.ints, path.offset, nonFuzzyPrefix)); upto++; } } if (subs.length == 0) { // automaton is empty, there is no accepted paths through it - return BasicAutomata.makeEmpty(); // matches nothing + return Automata.makeEmpty(); // matches nothing } else if (subs.length == 1) { // no synonyms or anything: just a single path through the tokenstream return subs[0]; } else { // multiple paths: this is really scary! is it slow? // maybe we should not do this and throw UOE? - Automaton a = BasicOperations.union(Arrays.asList(subs)); + Automaton a = Operations.union(Arrays.asList(subs)); // TODO: we could call toLevenshteinAutomata() before det? // this only happens if you have multiple paths anyway (e.g. synonyms) - BasicOperations.determinize(a); - - return a; + return Operations.determinize(a); } } } diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/InputArrayIterator.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/InputArrayIterator.java index 75301e92294a..937fb9ea2c87 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/InputArrayIterator.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/InputArrayIterator.java @@ -92,4 +92,4 @@ public Set contexts() { public boolean hasContexts() { return hasContexts; } -} \ No newline at end of file +} diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java index 6a0a58c2e72c..085905542e65 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java @@ -40,15 +40,14 @@ import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.search.suggest.Lookup.LookupResult; import org.apache.lucene.search.suggest.Input; import org.apache.lucene.search.suggest.InputArrayIterator; +import org.apache.lucene.search.suggest.Lookup.LookupResult; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.State; import org.apache.lucene.util.fst.Util; public class FuzzySuggesterTest extends LuceneTestCase { @@ -754,27 +753,28 @@ public void testRandom() throws Exception { // this: Automaton automaton = suggester.convertAutomaton(suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey))); assertTrue(automaton.isDeterministic()); + // TODO: could be faster... but its slowCompletor for a reason BytesRef spare = new BytesRef(); for (TermFreqPayload2 e : slowCompletor) { spare.copyChars(e.analyzedForm); Set finiteStrings = suggester.toFiniteStrings(spare, tokenStreamToAutomaton); for (IntsRef intsRef : finiteStrings) { - State p = automaton.getInitialState(); + int p = 0; BytesRef ref = Util.toBytesRef(intsRef, spare); boolean added = false; for (int i = ref.offset; i < ref.length; i++) { - State q = p.step(ref.bytes[i] & 0xff); - if (q == null) { + int q = automaton.step(p, ref.bytes[i] & 0xff); + if (q == -1) { break; - } else if (q.isAccept()) { + } else if (automaton.isAccept(q)) { matches.add(new LookupResult(e.surfaceForm, e.weight)); added = true; break; } p = q; } - if (!added && p.isAccept()) { + if (!added && automaton.isAccept(p)) { matches.add(new LookupResult(e.surfaceForm, e.weight)); } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java index 0aea06c0d773..25619a8c770b 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java @@ -17,15 +17,15 @@ * limitations under the License. */ -import static org.apache.lucene.util.automaton.BasicAutomata.makeEmpty; -import static org.apache.lucene.util.automaton.BasicAutomata.makeString; +import static org.apache.lucene.util.automaton.Automata.makeEmpty; +import static org.apache.lucene.util.automaton.Automata.makeString; import java.io.IOException; import java.util.Arrays; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.util.automaton.BasicOperations; +import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.CharacterRunAutomaton; /** @@ -43,7 +43,7 @@ public final class MockTokenFilter extends TokenFilter { /** Set of common english stopwords */ public static final CharacterRunAutomaton ENGLISH_STOPSET = - new CharacterRunAutomaton(BasicOperations.union(Arrays.asList( + new CharacterRunAutomaton(Operations.union(Arrays.asList( makeString("a"), makeString("an"), makeString("and"), makeString("are"), makeString("as"), makeString("at"), makeString("be"), makeString("but"), makeString("by"), makeString("for"), makeString("if"), makeString("in"), diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/SearchEquivalenceTestBase.java b/lucene/test-framework/src/java/org/apache/lucene/search/SearchEquivalenceTestBase.java index 2ce1e1c44368..a47dd3dd699a 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/search/SearchEquivalenceTestBase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/search/SearchEquivalenceTestBase.java @@ -33,7 +33,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; -import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.junit.AfterClass; import org.junit.BeforeClass; @@ -57,7 +57,7 @@ public static void beforeClass() throws Exception { Random random = random(); directory = newDirectory(); stopword = "" + randomChar(); - CharacterRunAutomaton stopset = new CharacterRunAutomaton(BasicAutomata.makeString(stopword)); + CharacterRunAutomaton stopset = new CharacterRunAutomaton(Automata.makeString(stopword)); analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false, stopset); RandomIndexWriter iw = new RandomIndexWriter(random, directory, analyzer); Document doc = new Document(); diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/automaton/AutomatonTestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/util/automaton/AutomatonTestUtil.java index f34183c1d346..41d7c5134465 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/automaton/AutomatonTestUtil.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/automaton/AutomatonTestUtil.java @@ -20,7 +20,6 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; -import java.util.IdentityHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; @@ -31,7 +30,6 @@ import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.TestUtil; import org.apache.lucene.util.UnicodeUtil; -import org.apache.lucene.util.fst.Util; /** * Utilities for testing automata. @@ -92,40 +90,40 @@ private static String randomRegexpString(Random r) { /** picks a random int code point, avoiding surrogates; * throws IllegalArgumentException if this transition only * accepts surrogates */ - private static int getRandomCodePoint(final Random r, final Transition t) { + private static int getRandomCodePoint(final Random r, int min, int max) { final int code; - if (t.max < UnicodeUtil.UNI_SUR_HIGH_START || - t.min > UnicodeUtil.UNI_SUR_HIGH_END) { + if (max < UnicodeUtil.UNI_SUR_HIGH_START || + min > UnicodeUtil.UNI_SUR_HIGH_END) { // easy: entire range is before or after surrogates - code = t.min+r.nextInt(t.max-t.min+1); - } else if (t.min >= UnicodeUtil.UNI_SUR_HIGH_START) { - if (t.max > UnicodeUtil.UNI_SUR_LOW_END) { + code = min+r.nextInt(max-min+1); + } else if (min >= UnicodeUtil.UNI_SUR_HIGH_START) { + if (max > UnicodeUtil.UNI_SUR_LOW_END) { // after surrogates - code = 1+UnicodeUtil.UNI_SUR_LOW_END+r.nextInt(t.max-UnicodeUtil.UNI_SUR_LOW_END); + code = 1+UnicodeUtil.UNI_SUR_LOW_END+r.nextInt(max-UnicodeUtil.UNI_SUR_LOW_END); } else { - throw new IllegalArgumentException("transition accepts only surrogates: " + t); + throw new IllegalArgumentException("transition accepts only surrogates: min=" + min + " max=" + max); } - } else if (t.max <= UnicodeUtil.UNI_SUR_LOW_END) { - if (t.min < UnicodeUtil.UNI_SUR_HIGH_START) { + } else if (max <= UnicodeUtil.UNI_SUR_LOW_END) { + if (min < UnicodeUtil.UNI_SUR_HIGH_START) { // before surrogates - code = t.min + r.nextInt(UnicodeUtil.UNI_SUR_HIGH_START - t.min); + code = min + r.nextInt(UnicodeUtil.UNI_SUR_HIGH_START - min); } else { - throw new IllegalArgumentException("transition accepts only surrogates: " + t); + throw new IllegalArgumentException("transition accepts only surrogates: min=" + min + " max=" + max); } } else { // range includes all surrogates - int gap1 = UnicodeUtil.UNI_SUR_HIGH_START - t.min; - int gap2 = t.max - UnicodeUtil.UNI_SUR_LOW_END; + int gap1 = UnicodeUtil.UNI_SUR_HIGH_START - min; + int gap2 = max - UnicodeUtil.UNI_SUR_LOW_END; int c = r.nextInt(gap1+gap2); if (c < gap1) { - code = t.min + c; + code = min + c; } else { code = UnicodeUtil.UNI_SUR_LOW_END + c - gap1 + 1; } } - assert code >= t.min && code <= t.max && (code < UnicodeUtil.UNI_SUR_HIGH_START || code > UnicodeUtil.UNI_SUR_LOW_END): - "code=" + code + " min=" + t.min + " max=" + t.max; + assert code >= min && code <= max && (code < UnicodeUtil.UNI_SUR_HIGH_START || code > UnicodeUtil.UNI_SUR_LOW_END): + "code=" + code + " min=" + min + " max=" + max; return code; } @@ -140,11 +138,13 @@ public static class RandomAcceptedStrings { private final Map leadsToAccept; private final Automaton a; + private final Transition[][] transitions; private static class ArrivingTransition { - final State from; + final int from; final Transition t; - public ArrivingTransition(State from, Transition t) { + + public ArrivingTransition(int from, Transition t) { this.from = from; this.t = t; } @@ -152,32 +152,30 @@ public ArrivingTransition(State from, Transition t) { public RandomAcceptedStrings(Automaton a) { this.a = a; - if (a.isSingleton()) { - leadsToAccept = null; - return; + if (a.getNumStates() == 0) { + throw new IllegalArgumentException("this automaton accepts nothing"); } + this.transitions = a.getSortedTransitions(); - // must use IdentityHashmap because two Transitions w/ - // different start nodes can be considered the same - leadsToAccept = new IdentityHashMap<>(); - final Map> allArriving = new HashMap<>(); + leadsToAccept = new HashMap<>(); + final Map> allArriving = new HashMap<>(); - final LinkedList q = new LinkedList<>(); - final Set seen = new HashSet<>(); + final LinkedList q = new LinkedList<>(); + final Set seen = new HashSet<>(); // reverse map the transitions, so we can quickly look // up all arriving transitions to a given state - for(State s: a.getNumberedStates()) { - for(int i=0;i tl = allArriving.get(t.to); + int numStates = a.getNumStates(); + for(int s=0;s tl = allArriving.get(t.dest); if (tl == null) { tl = new ArrayList<>(); - allArriving.put(t.to, tl); + allArriving.put(t.dest, tl); } tl.add(new ArrivingTransition(s, t)); } - if (s.accept) { + if (a.isAccept(s)) { q.add(s); seen.add(s); } @@ -185,12 +183,12 @@ public RandomAcceptedStrings(Automaton a) { // Breadth-first search, from accept states, // backwards: - while(!q.isEmpty()) { - final State s = q.removeFirst(); + while (q.isEmpty() == false) { + final int s = q.removeFirst(); List arriving = allArriving.get(s); if (arriving != null) { for(ArrivingTransition at : arriving) { - final State from = at.from; + final int from = at.from; if (!seen.contains(from)) { q.add(from); seen.add(from); @@ -204,62 +202,49 @@ public RandomAcceptedStrings(Automaton a) { public int[] getRandomAcceptedString(Random r) { final List soFar = new ArrayList<>(); - if (a.isSingleton()) { - // accepts only one - final String s = a.singleton; - - int charUpto = 0; - while(charUpto < s.length()) { - final int cp = s.codePointAt(charUpto); - charUpto += Character.charCount(cp); - soFar.add(cp); - } - } else { - State s = a.initial; + int s = 0; - while(true) { + while(true) { - if (s.accept) { - if (s.numTransitions == 0) { - // stop now + if (a.isAccept(s)) { + if (a.getNumTransitions(s) == 0) { + // stop now + break; + } else { + if (r.nextBoolean()) { break; - } else { - if (r.nextBoolean()) { - break; - } } } + } - if (s.numTransitions == 0) { - throw new RuntimeException("this automaton has dead states"); - } + if (a.getNumTransitions(s) == 0) { + throw new RuntimeException("this automaton has dead states"); + } - boolean cheat = r.nextBoolean(); - - final Transition t; - if (cheat) { - // pick a transition that we know is the fastest - // path to an accept state - List toAccept = new ArrayList<>(); - for(int i=0;i toAccept = new ArrayList<>(); + for(Transition t0 : transitions[s]) { + if (leadsToAccept.containsKey(t0)) { + toAccept.add(t0); } + } + if (toAccept.size() == 0) { + // this is OK -- it means we jumped into a cycle + t = transitions[s][r.nextInt(transitions[s].length)]; } else { - t = s.transitionsArray[r.nextInt(s.numTransitions)]; + t = toAccept.get(r.nextInt(toAccept.size())); } - soFar.add(getRandomCodePoint(r, t)); - s = t.to; + } else { + t = transitions[s][r.nextInt(transitions[s].length)]; } + soFar.add(getRandomCodePoint(r, t.min, t.max)); + s = t.dest; } return ArrayUtil.toIntArray(soFar); @@ -270,19 +255,21 @@ public int[] getRandomAcceptedString(Random r) { public static Automaton randomAutomaton(Random random) { // get two random Automata from regexps Automaton a1 = new RegExp(AutomatonTestUtil.randomRegexp(random), RegExp.NONE).toAutomaton(); - if (random.nextBoolean()) - a1 = BasicOperations.complement(a1); + if (random.nextBoolean()) { + a1 = Operations.complement(a1); + } Automaton a2 = new RegExp(AutomatonTestUtil.randomRegexp(random), RegExp.NONE).toAutomaton(); - if (random.nextBoolean()) - a2 = BasicOperations.complement(a2); - + if (random.nextBoolean()) { + a2 = Operations.complement(a2); + } + // combine them in random ways - switch(random.nextInt(4)) { - case 0: return BasicOperations.concatenate(a1, a2); - case 1: return BasicOperations.union(a1, a2); - case 2: return BasicOperations.intersection(a1, a2); - default: return BasicOperations.minus(a1, a2); + switch (random.nextInt(4)) { + case 0: return Operations.concatenate(a1, a2); + case 1: return Operations.union(a1, a2); + case 2: return Operations.intersection(a1, a2); + default: return Operations.minus(a1, a2); } } @@ -323,70 +310,81 @@ public static Automaton randomAutomaton(Random random) { /** * Simple, original brics implementation of Brzozowski minimize() */ - public static void minimizeSimple(Automaton a) { - if (a.isSingleton()) - return; - determinizeSimple(a, SpecialOperations.reverse(a)); - determinizeSimple(a, SpecialOperations.reverse(a)); + public static Automaton minimizeSimple(Automaton a) { + Set initialSet = new HashSet(); + a = determinizeSimple(Operations.reverse(a, initialSet), initialSet); + initialSet.clear(); + a = determinizeSimple(Operations.reverse(a, initialSet), initialSet); + return a; } /** * Simple, original brics implementation of determinize() */ - public static void determinizeSimple(Automaton a) { - if (a.deterministic || a.isSingleton()) - return; - Set initialset = new HashSet<>(); - initialset.add(a.initial); - determinizeSimple(a, initialset); + public static Automaton determinizeSimple(Automaton a) { + Set initialset = new HashSet<>(); + initialset.add(0); + return determinizeSimple(a, initialset); } - + /** * Simple, original brics implementation of determinize() * Determinizes the given automaton using the given set of initial states. */ - public static void determinizeSimple(Automaton a, Set initialset) { + public static Automaton determinizeSimple(Automaton a, Set initialset) { + if (a.getNumStates() == 0) { + return a; + } int[] points = a.getStartPoints(); // subset construction - Map, Set> sets = new HashMap<>(); - LinkedList> worklist = new LinkedList<>(); - Map, State> newstate = new HashMap<>(); + Map, Set> sets = new HashMap<>(); + LinkedList> worklist = new LinkedList<>(); + Map, Integer> newstate = new HashMap<>(); sets.put(initialset, initialset); worklist.add(initialset); - a.initial = new State(); - newstate.put(initialset, a.initial); + Automaton.Builder result = new Automaton.Builder(); + result.createState(); + newstate.put(initialset, 0); + Transition t = new Transition(); while (worklist.size() > 0) { - Set s = worklist.removeFirst(); - State r = newstate.get(s); - for (State q : s) - if (q.accept) { - r.accept = true; + Set s = worklist.removeFirst(); + int r = newstate.get(s); + for (int q : s) { + if (a.isAccept(q)) { + result.setAccept(r, true); break; } + } for (int n = 0; n < points.length; n++) { - Set p = new HashSet<>(); - for (State q : s) - for (Transition t : q.getTransitions()) - if (t.min <= points[n] && points[n] <= t.max) - p.add(t.to); + Set p = new HashSet<>(); + for (int q : s) { + int count = a.initTransition(q, t); + for(int i=0;i initialset) { */ public static Set getFiniteStringsRecursive(Automaton a, int limit) { HashSet strings = new HashSet<>(); - if (a.isSingleton()) { - if (limit > 0) { - strings.add(Util.toUTF32(a.singleton, new IntsRef())); - } - } else if (!getFiniteStrings(a.initial, new HashSet(), strings, new IntsRef(), limit)) { + if (!getFiniteStrings(a, 0, new HashSet(), strings, new IntsRef(), limit)) { return strings; } return strings; @@ -418,24 +412,27 @@ public static Set getFiniteStringsRecursive(Automaton a, int limit) { * false if more than limit strings are found. * limit<0 means "infinite". */ - private static boolean getFiniteStrings(State s, HashSet pathstates, + private static boolean getFiniteStrings(Automaton a, int s, HashSet pathstates, HashSet strings, IntsRef path, int limit) { pathstates.add(s); - for (Transition t : s.getTransitions()) { - if (pathstates.contains(t.to)) { + Transition t = new Transition(); + int count = a.initTransition(s, t); + for (int i=0;i= 0 && strings.size() > limit) { return false; } } - if (!getFiniteStrings(t.to, pathstates, strings, path, limit)) { + if (!getFiniteStrings(a, t.dest, pathstates, strings, path, limit)) { return false; } path.length--; @@ -452,8 +449,10 @@ private static boolean getFiniteStrings(State s, HashSet pathstates, * this is only used to test the correctness of our faster implementation. */ public static boolean isFiniteSlow(Automaton a) { - if (a.isSingleton()) return true; - return isFiniteSlow(a.initial, new HashSet()); + if (a.getNumStates() == 0) { + return true; + } + return isFiniteSlow(a, 0, new HashSet()); } /** @@ -462,22 +461,48 @@ public static boolean isFiniteSlow(Automaton a) { */ // TODO: not great that this is recursive... in theory a // large automata could exceed java's stack - private static boolean isFiniteSlow(State s, HashSet path) { + private static boolean isFiniteSlow(Automaton a, int s, HashSet path) { path.add(s); - for (Transition t : s.getTransitions()) - if (path.contains(t.to) || !isFiniteSlow(t.to, path)) return false; + Transition t = new Transition(); + int count = a.initTransition(s, t); + for (int i=0;i 0 && prefix.charAt(0) == '\u0001'; } diff --git a/solr/test-framework/src/java/org/apache/solr/analysis/MockTokenFilterFactory.java b/solr/test-framework/src/java/org/apache/solr/analysis/MockTokenFilterFactory.java index f7a5183ab421..56d7c57f776d 100644 --- a/solr/test-framework/src/java/org/apache/solr/analysis/MockTokenFilterFactory.java +++ b/solr/test-framework/src/java/org/apache/solr/analysis/MockTokenFilterFactory.java @@ -62,4 +62,4 @@ public MockTokenFilterFactory(Map args) { public MockTokenFilter create(TokenStream stream) { return new MockTokenFilter(stream, filter); } -} \ No newline at end of file +}