ns = aa.getAcceptStates();
- for (State s : ac) {
- s.accept = false;
- s.addEpsilon(aa.initial);
- if (s.accept) ns.add(s);
- }
- ac = ns;
- }
- b.deterministic = false;
- //b.clearHashCode();
- b.clearNumberedStates();
- b.checkMinimizeAlways();
- return b;
- }
- }
-
- /**
- * Returns an automaton that accepts the union of the empty string and the
- * language of the given automaton.
- *
- * Complexity: linear in number of states.
- */
- static public Automaton optional(Automaton a) {
- a = a.cloneExpandedIfRequired();
- State s = new State();
- s.addEpsilon(a.initial);
- s.accept = true;
- a.initial = s;
- a.deterministic = false;
- //a.clearHashCode();
- a.clearNumberedStates();
- a.checkMinimizeAlways();
- return a;
- }
-
- /**
- * Returns an automaton that accepts the Kleene star (zero or more
- * concatenated repetitions) of the language of the given automaton. Never
- * modifies the input automaton language.
- *
- * Complexity: linear in number of states.
- */
- static public Automaton repeat(Automaton a) {
- a = a.cloneExpanded();
- State s = new State();
- s.accept = true;
- s.addEpsilon(a.initial);
- for (State p : a.getAcceptStates())
- p.addEpsilon(s);
- a.initial = s;
- a.deterministic = false;
- //a.clearHashCode();
- a.clearNumberedStates();
- a.checkMinimizeAlways();
- return a;
- }
-
- /**
- * Returns an automaton that accepts <code>min</code> or more concatenated
- * repetitions of the language of the given automaton.
- *
- * Complexity: linear in number of states and in <code>min</code>.
- */
- static public Automaton repeat(Automaton a, int min) {
- if (min == 0) return repeat(a);
- List as = new ArrayList<>();
- while (min-- > 0)
- as.add(a);
- as.add(repeat(a));
- return concatenate(as);
- }
-
- /**
- * Returns an automaton that accepts between <code>min</code> and
- * <code>max</code> (including both) concatenated repetitions of the language
- * of the given automaton.
- *
- * Complexity: linear in number of states and in <code>min</code> and
- * <code>max</code>.
- */
- static public Automaton repeat(Automaton a, int min, int max) {
- if (min > max) return BasicAutomata.makeEmpty();
- max -= min;
- a.expandSingleton();
- Automaton b;
- if (min == 0) b = BasicAutomata.makeEmptyString();
- else if (min == 1) b = a.clone();
- else {
- List as = new ArrayList<>();
- while (min-- > 0)
- as.add(a);
- b = concatenate(as);
- }
- if (max > 0) {
- Automaton d = a.clone();
- while (--max > 0) {
- Automaton c = a.clone();
- for (State p : c.getAcceptStates())
- p.addEpsilon(d.initial);
- d = c;
- }
- for (State p : b.getAcceptStates())
- p.addEpsilon(d.initial);
- b.deterministic = false;
- //b.clearHashCode();
- b.clearNumberedStates();
- b.checkMinimizeAlways();
- }
- return b;
- }
-
- /**
- * Returns a (deterministic) automaton that accepts the complement of the
- * language of the given automaton.
- *
- * Complexity: linear in number of states (if already deterministic).
- */
- static public Automaton complement(Automaton a) {
- a = a.cloneExpandedIfRequired();
- a.determinize();
- a.totalize();
- for (State p : a.getNumberedStates())
- p.accept = !p.accept;
- a.removeDeadTransitions();
- return a;
- }
-
- /**
- * Returns a (deterministic) automaton that accepts the intersection of the
- * language of <code>a1</code> and the complement of the language of
- * <code>a2</code>. As a side-effect, the automata may be determinized, if not
- * already deterministic.
- *
- * Complexity: quadratic in number of states (if already deterministic).
- */
- static public Automaton minus(Automaton a1, Automaton a2) {
- if (BasicOperations.isEmpty(a1) || a1 == a2) return BasicAutomata
- .makeEmpty();
- if (BasicOperations.isEmpty(a2)) return a1.cloneIfRequired();
- if (a1.isSingleton()) {
- if (BasicOperations.run(a2, a1.singleton)) return BasicAutomata.makeEmpty();
- else return a1.cloneIfRequired();
- }
- return intersection(a1, a2.complement());
- }
-
- /**
- * Returns an automaton that accepts the intersection of the languages of the
- * given automata. Never modifies the input automata languages.
- *
- * Complexity: quadratic in number of states.
- */
- static public Automaton intersection(Automaton a1, Automaton a2) {
- if (a1.isSingleton()) {
- if (BasicOperations.run(a2, a1.singleton)) return a1.cloneIfRequired();
- else return BasicAutomata.makeEmpty();
- }
- if (a2.isSingleton()) {
- if (BasicOperations.run(a1, a2.singleton)) return a2.cloneIfRequired();
- else return BasicAutomata.makeEmpty();
- }
- if (a1 == a2) return a1.cloneIfRequired();
- Transition[][] transitions1 = a1.getSortedTransitions();
- Transition[][] transitions2 = a2.getSortedTransitions();
- Automaton c = new Automaton();
- LinkedList worklist = new LinkedList<>();
- HashMap newstates = new HashMap<>();
- StatePair p = new StatePair(c.initial, a1.initial, a2.initial);
- worklist.add(p);
- newstates.put(p, p);
- while (worklist.size() > 0) {
- p = worklist.removeFirst();
- p.s.accept = p.s1.accept && p.s2.accept;
- Transition[] t1 = transitions1[p.s1.number];
- Transition[] t2 = transitions2[p.s2.number];
- for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) {
- while (b2 < t2.length && t2[b2].max < t1[n1].min)
- b2++;
- for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++)
- if (t2[n2].max >= t1[n1].min) {
- StatePair q = new StatePair(t1[n1].to, t2[n2].to);
- StatePair r = newstates.get(q);
- if (r == null) {
- q.s = new State();
- worklist.add(q);
- newstates.put(q, q);
- r = q;
- }
- int min = t1[n1].min > t2[n2].min ? t1[n1].min : t2[n2].min;
- int max = t1[n1].max < t2[n2].max ? t1[n1].max : t2[n2].max;
- p.s.addTransition(new Transition(min, max, r.s));
- }
- }
- }
- c.deterministic = a1.deterministic && a2.deterministic;
- c.removeDeadTransitions();
- c.checkMinimizeAlways();
- return c;
- }
-
- /** Returns true if these two automata accept exactly the
- * same language. This is a costly computation! Note
- * also that a1 and a2 will be determinized as a side
- * effect. */
- public static boolean sameLanguage(Automaton a1, Automaton a2) {
- if (a1 == a2) {
- return true;
- }
- if (a1.isSingleton() && a2.isSingleton()) {
- return a1.singleton.equals(a2.singleton);
- } else if (a1.isSingleton()) {
- // subsetOf is faster if the first automaton is a singleton
- return subsetOf(a1, a2) && subsetOf(a2, a1);
- } else {
- return subsetOf(a2, a1) && subsetOf(a1, a2);
- }
- }
-
- /**
- * Returns true if the language of <code>a1</code> is a subset of the language
- * of <code>a2</code>. As a side-effect, <code>a2</code> is determinized if
- * not already marked as deterministic.
- *
- * Complexity: quadratic in number of states.
- */
- public static boolean subsetOf(Automaton a1, Automaton a2) {
- if (a1 == a2) return true;
- if (a1.isSingleton()) {
- if (a2.isSingleton()) return a1.singleton.equals(a2.singleton);
- return BasicOperations.run(a2, a1.singleton);
- }
- a2.determinize();
- Transition[][] transitions1 = a1.getSortedTransitions();
- Transition[][] transitions2 = a2.getSortedTransitions();
- LinkedList worklist = new LinkedList<>();
- HashSet visited = new HashSet<>();
- StatePair p = new StatePair(a1.initial, a2.initial);
- worklist.add(p);
- visited.add(p);
- while (worklist.size() > 0) {
- p = worklist.removeFirst();
- if (p.s1.accept && !p.s2.accept) {
- return false;
- }
- Transition[] t1 = transitions1[p.s1.number];
- Transition[] t2 = transitions2[p.s2.number];
- for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) {
- while (b2 < t2.length && t2[b2].max < t1[n1].min)
- b2++;
- int min1 = t1[n1].min, max1 = t1[n1].max;
-
- for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) {
- if (t2[n2].min > min1) {
- return false;
- }
- if (t2[n2].max < Character.MAX_CODE_POINT) min1 = t2[n2].max + 1;
- else {
- min1 = Character.MAX_CODE_POINT;
- max1 = Character.MIN_CODE_POINT;
- }
- StatePair q = new StatePair(t1[n1].to, t2[n2].to);
- if (!visited.contains(q)) {
- worklist.add(q);
- visited.add(q);
- }
- }
- if (min1 <= max1) {
- return false;
- }
- }
- }
- return true;
- }
-
- /**
- * Returns an automaton that accepts the union of the languages of the given
- * automata.
- *
- * Complexity: linear in number of states.
- */
- public static Automaton union(Automaton a1, Automaton a2) {
- if ((a1.isSingleton() && a2.isSingleton() && a1.singleton
- .equals(a2.singleton))
- || a1 == a2) return a1.cloneIfRequired();
- if (a1 == a2) {
- a1 = a1.cloneExpanded();
- a2 = a2.cloneExpanded();
- } else {
- a1 = a1.cloneExpandedIfRequired();
- a2 = a2.cloneExpandedIfRequired();
- }
- State s = new State();
- s.addEpsilon(a1.initial);
- s.addEpsilon(a2.initial);
- a1.initial = s;
- a1.deterministic = false;
- //a1.clearHashCode();
- a1.clearNumberedStates();
- a1.checkMinimizeAlways();
- return a1;
- }
-
- /**
- * Returns an automaton that accepts the union of the languages of the given
- * automata.
- *
- * Complexity: linear in number of states.
- */
- public static Automaton union(Collection l) {
- Set ids = new HashSet<>();
- for (Automaton a : l)
- ids.add(System.identityHashCode(a));
- boolean has_aliases = ids.size() != l.size();
- State s = new State();
- for (Automaton b : l) {
- if (BasicOperations.isEmpty(b)) continue;
- Automaton bb = b;
- if (has_aliases) bb = bb.cloneExpanded();
- else bb = bb.cloneExpandedIfRequired();
- s.addEpsilon(bb.initial);
- }
- Automaton a = new Automaton();
- a.initial = s;
- a.deterministic = false;
- //a.clearHashCode();
- a.clearNumberedStates();
- a.checkMinimizeAlways();
- return a;
- }
-
- // Simple custom ArrayList
- private final static class TransitionList {
- Transition[] transitions = new Transition[2];
- int count;
-
- public void add(Transition t) {
- if (transitions.length == count) {
- Transition[] newArray = new Transition[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
- System.arraycopy(transitions, 0, newArray, 0, count);
- transitions = newArray;
- }
- transitions[count++] = t;
- }
- }
-
- // Holds all transitions that start on this int point, or
- // end at this point-1
- private final static class PointTransitions implements Comparable {
- int point;
- final TransitionList ends = new TransitionList();
- final TransitionList starts = new TransitionList();
- @Override
- public int compareTo(PointTransitions other) {
- return point - other.point;
- }
-
- public void reset(int point) {
- this.point = point;
- ends.count = 0;
- starts.count = 0;
- }
-
- @Override
- public boolean equals(Object other) {
- return ((PointTransitions) other).point == point;
- }
-
- @Override
- public int hashCode() {
- return point;
- }
- }
-
- private final static class PointTransitionSet {
- int count;
- PointTransitions[] points = new PointTransitions[5];
-
- private final static int HASHMAP_CUTOVER = 30;
- private final HashMap map = new HashMap<>();
- private boolean useHash = false;
-
- private PointTransitions next(int point) {
- // 1st time we are seeing this point
- if (count == points.length) {
- final PointTransitions[] newArray = new PointTransitions[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
- System.arraycopy(points, 0, newArray, 0, count);
- points = newArray;
- }
- PointTransitions points0 = points[count];
- if (points0 == null) {
- points0 = points[count] = new PointTransitions();
- }
- points0.reset(point);
- count++;
- return points0;
- }
-
- private PointTransitions find(int point) {
- if (useHash) {
- final Integer pi = point;
- PointTransitions p = map.get(pi);
- if (p == null) {
- p = next(point);
- map.put(pi, p);
- }
- return p;
- } else {
- for(int i=0;i 1) ArrayUtil.timSort(points, 0, count);
- }
-
- public void add(Transition t) {
- find(t.min).starts.add(t);
- find(1+t.max).ends.add(t);
- }
-
- @Override
- public String toString() {
- StringBuilder s = new StringBuilder();
- for(int i=0;i 0) {
- s.append(' ');
- }
- s.append(points[i].point).append(':').append(points[i].starts.count).append(',').append(points[i].ends.count);
- }
- return s.toString();
- }
- }
-
- /**
- * Determinizes the given automaton.
- *
- * Worst case complexity: exponential in number of states.
- */
- public static void determinize(Automaton a) {
- if (a.deterministic || a.isSingleton()) {
- return;
- }
-
- final State[] allStates = a.getNumberedStates();
-
- // subset construction
- final boolean initAccept = a.initial.accept;
- final int initNumber = a.initial.number;
- a.initial = new State();
- SortedIntSet.FrozenIntSet initialset = new SortedIntSet.FrozenIntSet(initNumber, a.initial);
-
- LinkedList worklist = new LinkedList<>();
- Map newstate = new HashMap<>();
-
- worklist.add(initialset);
-
- a.initial.accept = initAccept;
- newstate.put(initialset, a.initial);
-
- int newStateUpto = 0;
- State[] newStatesArray = new State[5];
- newStatesArray[newStateUpto] = a.initial;
- a.initial.number = newStateUpto;
- newStateUpto++;
-
- // like Set
- final PointTransitionSet points = new PointTransitionSet();
-
- // like SortedMap
- final SortedIntSet statesSet = new SortedIntSet(5);
-
- while (worklist.size() > 0) {
- SortedIntSet.FrozenIntSet s = worklist.removeFirst();
-
- // Collate all outgoing transitions by min/1+max:
- for(int i=0;i 0) {
- assert lastPoint != -1;
-
- statesSet.computeHash();
-
- State q = newstate.get(statesSet);
- if (q == null) {
- q = new State();
- final SortedIntSet.FrozenIntSet p = statesSet.freeze(q);
- worklist.add(p);
- if (newStateUpto == newStatesArray.length) {
- final State[] newArray = new State[ArrayUtil.oversize(1+newStateUpto, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
- System.arraycopy(newStatesArray, 0, newArray, 0, newStateUpto);
- newStatesArray = newArray;
- }
- newStatesArray[newStateUpto] = q;
- q.number = newStateUpto;
- newStateUpto++;
- q.accept = accCount > 0;
- newstate.put(p, q);
- } else {
- assert (accCount > 0 ? true:false) == q.accept: "accCount=" + accCount + " vs existing accept=" + q.accept + " states=" + statesSet;
- }
-
- r.addTransition(new Transition(lastPoint, point-1, q));
- }
-
- // process transitions that end on this point
- // (closes an overlapping interval)
- Transition[] transitions = points.points[i].ends.transitions;
- int limit = points.points[i].ends.count;
- for(int j=0;j pairs) {
- a.expandSingleton();
- HashMap> forward = new HashMap<>();
- HashMap> back = new HashMap<>();
- for (StatePair p : pairs) {
- HashSet to = forward.get(p.s1);
- if (to == null) {
- to = new HashSet<>();
- forward.put(p.s1, to);
- }
- to.add(p.s2);
- HashSet from = back.get(p.s2);
- if (from == null) {
- from = new HashSet<>();
- back.put(p.s2, from);
- }
- from.add(p.s1);
- }
- // calculate epsilon closure
- LinkedList worklist = new LinkedList<>(pairs);
- HashSet workset = new HashSet<>(pairs);
- while (!worklist.isEmpty()) {
- StatePair p = worklist.removeFirst();
- workset.remove(p);
- HashSet to = forward.get(p.s2);
- HashSet from = back.get(p.s1);
- if (to != null) {
- for (State s : to) {
- StatePair pp = new StatePair(p.s1, s);
- if (!pairs.contains(pp)) {
- pairs.add(pp);
- forward.get(p.s1).add(s);
- back.get(s).add(p.s1);
- worklist.add(pp);
- workset.add(pp);
- if (from != null) {
- for (State q : from) {
- StatePair qq = new StatePair(q, p.s1);
- if (!workset.contains(qq)) {
- worklist.add(qq);
- workset.add(qq);
- }
- }
- }
- }
- }
- }
- }
- // add transitions
- for (StatePair p : pairs)
- p.s1.addEpsilon(p.s2);
- a.deterministic = false;
- //a.clearHashCode();
- a.clearNumberedStates();
- a.checkMinimizeAlways();
- }
-
- /**
- * Returns true if the given automaton accepts the empty string and nothing
- * else.
- */
- public static boolean isEmptyString(Automaton a) {
- if (a.isSingleton()) return a.singleton.length() == 0;
- else return a.initial.accept && a.initial.numTransitions() == 0;
- }
-
- /**
- * Returns true if the given automaton accepts no strings.
- */
- public static boolean isEmpty(Automaton a) {
- if (a.isSingleton()) return false;
- return !a.initial.accept && a.initial.numTransitions() == 0;
- }
-
- /**
- * Returns true if the given automaton accepts all strings.
- */
- public static boolean isTotal(Automaton a) {
- if (a.isSingleton()) return false;
- if (a.initial.accept && a.initial.numTransitions() == 1) {
- Transition t = a.initial.getTransitions().iterator().next();
- return t.to == a.initial && t.min == Character.MIN_CODE_POINT
- && t.max == Character.MAX_CODE_POINT;
- }
- return false;
- }
-
- /**
- * Returns true if the given string is accepted by the automaton.
- *
- * Complexity: linear in the length of the string.
- *
- * Note: for full performance, use the {@link RunAutomaton} class.
- */
- public static boolean run(Automaton a, String s) {
- if (a.isSingleton()) return s.equals(a.singleton);
- if (a.deterministic) {
- State p = a.initial;
- for (int i = 0, cp = 0; i < s.length(); i += Character.charCount(cp)) {
- State q = p.step(cp = s.codePointAt(i));
- if (q == null) return false;
- p = q;
- }
- return p.accept;
- } else {
- State[] states = a.getNumberedStates();
- LinkedList pp = new LinkedList<>();
- LinkedList pp_other = new LinkedList<>();
- BitSet bb = new BitSet(states.length);
- BitSet bb_other = new BitSet(states.length);
- pp.add(a.initial);
- ArrayList dest = new ArrayList<>();
- boolean accept = a.initial.accept;
- for (int i = 0, c = 0; i < s.length(); i += Character.charCount(c)) {
- c = s.codePointAt(i);
- accept = false;
- pp_other.clear();
- bb_other.clear();
- for (State p : pp) {
- dest.clear();
- p.step(c, dest);
- for (State q : dest) {
- if (q.accept) accept = true;
- if (!bb_other.get(q.number)) {
- bb_other.set(q.number);
- pp_other.add(q);
- }
- }
- }
- LinkedList tp = pp;
- pp = pp_other;
- pp_other = tp;
- BitSet tb = bb;
- bb = bb_other;
- bb_other = tb;
- }
- return accept;
- }
- }
-}
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
index 8c8d68a24d4b..5804ef13a92b 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
@@ -21,7 +21,8 @@
* Automaton representation for matching UTF-8 byte[].
*/
public class ByteRunAutomaton extends RunAutomaton {
-
+
+ /** Converts incoming automaton to byte-based (UTF32ToUTF8) first */
public ByteRunAutomaton(Automaton a) {
this(a, false);
}
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java
index 2dcd922871cb..8582870c52cd 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java
@@ -22,6 +22,7 @@
*/
public class CharacterRunAutomaton extends RunAutomaton {
+ /** Sole constructor. */
public CharacterRunAutomaton(Automaton a) {
super(a, Character.MAX_CODE_POINT, false);
}
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
index 6d8666fd4a9e..ab2358a0988e 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
@@ -19,7 +19,6 @@
import java.io.IOException;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.List;
import org.apache.lucene.index.Terms;
@@ -52,6 +51,8 @@ public enum AUTOMATON_TYPE {
/** Catch-all for any other automata. */
NORMAL
};
+
+ /** If simplify is true this will be the "simplified" type; else, this is NORMAL */
public final AUTOMATON_TYPE type;
/**
@@ -65,21 +66,22 @@ public enum AUTOMATON_TYPE {
* only valid for {@link AUTOMATON_TYPE#NORMAL}.
*/
public final ByteRunAutomaton runAutomaton;
- // TODO: would be nice if these sortedTransitions had "int
- // to;" instead of "State to;" somehow:
+
/**
* Two dimensional array of transitions, indexed by state
* number for traversal. The state numbering is consistent with
* {@link #runAutomaton}.
* Only valid for {@link AUTOMATON_TYPE#NORMAL}.
*/
- public final Transition[][] sortedTransitions;
+ public final Automaton automaton;
+
/**
* Shared common suffix accepted by the automaton. Only valid
* for {@link AUTOMATON_TYPE#NORMAL}, and only when the
* automaton accepts an infinite language.
*/
public final BytesRef commonSuffixRef;
+
/**
* Indicates if the automaton accepts a finite set of strings.
* Null if this was not computed.
@@ -87,125 +89,157 @@ public enum AUTOMATON_TYPE {
*/
public final Boolean finite;
+ /** Create this, passing simplify=true and finite=null, so that we try
+ * to simplify the automaton and determine if it is finite. */
public CompiledAutomaton(Automaton automaton) {
this(automaton, null, true);
}
+ /** Create this. If finite is null, we use {@link Operations#isFinite}
+ * to determine whether it is finite. If simplify is true, we run
+ * possibly expensive operations to determine if the automaton is one
+ * of the cases in {@link CompiledAutomaton.AUTOMATON_TYPE}. */
public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify) {
+ if (automaton.getNumStates() == 0) {
+ automaton = new Automaton();
+ automaton.createState();
+ }
+
if (simplify) {
+
// Test whether the automaton is a "simple" form and
// if so, don't create a runAutomaton. Note that on a
// large automaton these tests could be costly:
- if (BasicOperations.isEmpty(automaton)) {
+
+ if (Operations.isEmpty(automaton)) {
// matches nothing
type = AUTOMATON_TYPE.NONE;
term = null;
commonSuffixRef = null;
runAutomaton = null;
- sortedTransitions = null;
+ this.automaton = null;
this.finite = null;
return;
- } else if (BasicOperations.isTotal(automaton)) {
+ // NOTE: only approximate, because automaton may not be minimal:
+ } else if (Operations.isTotal(automaton)) {
// matches all possible strings
type = AUTOMATON_TYPE.ALL;
term = null;
commonSuffixRef = null;
runAutomaton = null;
- sortedTransitions = null;
+ this.automaton = null;
this.finite = null;
return;
} else {
- final String commonPrefix;
+
+ automaton = Operations.determinize(automaton);
+
+ final String commonPrefix = Operations.getCommonPrefix(automaton);
final String singleton;
- if (automaton.getSingleton() == null) {
- commonPrefix = SpecialOperations.getCommonPrefix(automaton);
- if (commonPrefix.length() > 0 && BasicOperations.sameLanguage(automaton, BasicAutomata.makeString(commonPrefix))) {
- singleton = commonPrefix;
- } else {
- singleton = null;
- }
+
+ if (commonPrefix.length() > 0 && Operations.sameLanguage(automaton, Automata.makeString(commonPrefix))) {
+ singleton = commonPrefix;
} else {
- commonPrefix = null;
- singleton = automaton.getSingleton();
+ singleton = null;
}
-
+
if (singleton != null) {
- // matches a fixed string in singleton or expanded
- // representation
+ // matches a fixed string
type = AUTOMATON_TYPE.SINGLE;
term = new BytesRef(singleton);
commonSuffixRef = null;
runAutomaton = null;
- sortedTransitions = null;
- this.finite = null;
- return;
- } else if (BasicOperations.sameLanguage(automaton, BasicOperations.concatenate(
- BasicAutomata.makeString(commonPrefix), BasicAutomata.makeAnyString()))) {
- // matches a constant prefix
- type = AUTOMATON_TYPE.PREFIX;
- term = new BytesRef(commonPrefix);
- commonSuffixRef = null;
- runAutomaton = null;
- sortedTransitions = null;
+ this.automaton = null;
this.finite = null;
return;
+ } else if (commonPrefix.length() > 0) {
+ Automaton other = Operations.concatenate(Automata.makeString(commonPrefix), Automata.makeAnyString());
+ other = Operations.determinize(other);
+ assert Operations.hasDeadStates(other) == false;
+ if (Operations.sameLanguage(automaton, other)) {
+ // matches a constant prefix
+ type = AUTOMATON_TYPE.PREFIX;
+ term = new BytesRef(commonPrefix);
+ commonSuffixRef = null;
+ runAutomaton = null;
+ this.automaton = null;
+ this.finite = null;
+ return;
+ }
}
}
}
type = AUTOMATON_TYPE.NORMAL;
term = null;
+
if (finite == null) {
- this.finite = SpecialOperations.isFinite(automaton);
+ this.finite = Operations.isFinite(automaton);
} else {
this.finite = finite;
}
+
Automaton utf8 = new UTF32ToUTF8().convert(automaton);
if (this.finite) {
commonSuffixRef = null;
} else {
- commonSuffixRef = SpecialOperations.getCommonSuffixBytesRef(utf8);
+ commonSuffixRef = Operations.getCommonSuffixBytesRef(utf8);
}
runAutomaton = new ByteRunAutomaton(utf8, true);
- sortedTransitions = utf8.getSortedTransitions();
+
+ this.automaton = runAutomaton.automaton;
}
+
+ private Transition transition = new Transition();
//private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG;
private BytesRef addTail(int state, BytesRef term, int idx, int leadLabel) {
-
+ //System.out.println("addTail state=" + state + " term=" + term.utf8ToString() + " idx=" + idx + " leadLabel=" + (char) leadLabel);
+ //System.out.println(automaton.toDot());
// Find biggest transition that's < label
// TODO: use binary search here
- Transition maxTransition = null;
- for (Transition transition : sortedTransitions[state]) {
+ int maxIndex = -1;
+ int numTransitions = automaton.initTransition(state, transition);
+ for(int i=0;i ").append(i).append("\n");
- }
- for (int j = 0; j < sortedTransitions[i].length; j++) {
- b.append(" ").append(i);
- sortedTransitions[i][j].appendDot(b);
- }
- }
- return b.append("}\n").toString();
- }
@Override
public int hashCode() {
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java
index 68ce1e9a0517..f96b837b8935 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java
@@ -29,7 +29,7 @@
* (nearly linear with the input size).
*
* @see #build(Collection)
- * @see BasicAutomata#makeStringUnion(Collection)
+ * @see Automata#makeStringUnion(Collection)
*/
final class DaciukMihovAutomatonBuilder {
/**
@@ -249,20 +249,22 @@ public State complete() {
/**
* Internal recursive traversal for conversion.
*/
- private static org.apache.lucene.util.automaton.State convert(State s,
- IdentityHashMap visited) {
- org.apache.lucene.util.automaton.State converted = visited.get(s);
- if (converted != null) return converted;
+ private static int convert(Automaton.Builder a, State s,
+ IdentityHashMap visited) {
+
+ Integer converted = visited.get(s);
+ if (converted != null) {
+ return converted;
+ }
- converted = new org.apache.lucene.util.automaton.State();
- converted.setAccept(s.is_final);
+ converted = a.createState();
+ a.setAccept(converted, s.is_final);
visited.put(s, converted);
int i = 0;
int[] labels = s.labels;
for (DaciukMihovAutomatonBuilder.State target : s.states) {
- converted.addTransition(
- new Transition(labels[i++], convert(target, visited)));
+ a.addTransition(converted, convert(a, target, visited), labels[i++]);
}
return converted;
@@ -281,12 +283,12 @@ public static Automaton build(Collection input) {
builder.add(scratch);
}
- Automaton a = new Automaton();
- a.initial = convert(
+ Automaton.Builder a = new Automaton.Builder();
+ convert(a,
builder.complete(),
- new IdentityHashMap());
- a.deterministic = true;
- return a;
+ new IdentityHashMap());
+
+ return a.finish();
}
/**
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java
index 869dc3ad5515..01badf0fa99e 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java
@@ -21,6 +21,8 @@
import java.util.SortedSet;
import java.util.TreeSet;
+import org.apache.lucene.util.UnicodeUtil;
+
/**
* Class to construct DFAs that match a word within some edit distance.
*
@@ -30,7 +32,8 @@
* @lucene.experimental
*/
public class LevenshteinAutomata {
- /** @lucene.internal */
+ /** Maximum edit distance this class can generate an automaton for.
+ * @lucene.internal */
public static final int MAXIMUM_SUPPORTED_DISTANCE = 2;
/* input word */
final int word[];
@@ -112,7 +115,7 @@ private static int[] codePoints(String input) {
}
return word;
}
-
+
/**
* Compute a DFA that accepts all strings within an edit distance of <code>n</code>.
*
@@ -125,8 +128,25 @@ private static int[] codePoints(String input) {
*
*/
public Automaton toAutomaton(int n) {
+ return toAutomaton(n, "");
+ }
+
+ /**
+ * Compute a DFA that accepts all strings within an edit distance of <code>n</code>,
+ * matching the specified exact prefix.
+ *
+ * All automata have the following properties:
+ * <ul>
+ * <li>They are deterministic (DFA).
+ * <li>There are no transitions to dead states.
+ * <li>They are not minimal (some transitions could be combined).
+ * </ul>
+ *
+ */
+ public Automaton toAutomaton(int n, String prefix) {
+ assert prefix != null;
if (n == 0) {
- return BasicAutomata.makeString(word, 0, word.length);
+ return Automata.makeString(prefix + UnicodeUtil.newString(word, 0, word.length));
}
if (n >= descriptions.length)
@@ -135,15 +155,36 @@ public Automaton toAutomaton(int n) {
final int range = 2*n+1;
ParametricDescription description = descriptions[n];
// the number of states is based on the length of the word and n
- State states[] = new State[description.size()];
+ int numStates = description.size();
+
+ Automaton a = new Automaton();
+ int lastState;
+ if (prefix != null) {
+ // Insert prefix
+ lastState = a.createState();
+ for (int i = 0, cp = 0; i < prefix.length(); i += Character.charCount(cp)) {
+ int state = a.createState();
+ cp = prefix.codePointAt(i);
+ a.addTransition(lastState, state, cp, cp);
+ lastState = state;
+ }
+ } else {
+ lastState = a.createState();
+ }
+
+ int stateOffset = lastState;
+ a.setAccept(lastState, description.isAccept(0));
+
// create all states, and mark as accept states if appropriate
- for (int i = 0; i < states.length; i++) {
- states[i] = new State();
- states[i].number = i;
- states[i].setAccept(description.isAccept(i));
+ for (int i = 1; i < numStates; i++) {
+ int state = a.createState();
+ a.setAccept(state, description.isAccept(i));
}
+
+ // TODO: this creates bogus states/transitions (states are final, have self loops, and can't be reached from an init state)
+
// create transitions from state to state
- for (int k = 0; k < states.length; k++) {
+ for (int k = 0; k < numStates; k++) {
final int xpos = description.getPosition(k);
if (xpos < 0)
continue;
@@ -154,31 +195,26 @@ public Automaton toAutomaton(int n) {
// get the characteristic vector at this position wrt ch
final int cvec = getVector(ch, xpos, end);
int dest = description.transition(k, xpos, cvec);
- if (dest >= 0)
- states[k].addTransition(new Transition(ch, states[dest]));
+ if (dest >= 0) {
+ a.addTransition(stateOffset+k, stateOffset+dest, ch);
+ }
}
// add transitions for all other chars in unicode
// by definition, their characteristic vectors are always 0,
// because they do not exist in the input string.
int dest = description.transition(k, xpos, 0); // by definition
- if (dest >= 0)
- for (int r = 0; r < numRanges; r++)
- states[k].addTransition(new Transition(rangeLower[r], rangeUpper[r], states[dest]));
+ if (dest >= 0) {
+ for (int r = 0; r < numRanges; r++) {
+ a.addTransition(stateOffset+k, stateOffset+dest, rangeLower[r], rangeUpper[r]);
+ }
+ }
}
- Automaton a = new Automaton(states[0]);
- a.setDeterministic(true);
- // we create some useless unconnected states, and its a net-win overall to remove these,
- // as well as to combine any adjacent transitions (it makes later algorithms more efficient).
- // so, while we could set our numberedStates here, its actually best not to, and instead to
- // force a traversal in reduce, pruning the unconnected states while we combine adjacent transitions.
- //a.setNumberedStates(states);
- a.reduce();
- // we need not trim transitions to dead states, as they are not created.
- //a.restoreInvariant();
+ a.finishState();
+ assert a.isDeterministic();
return a;
}
-
+
/**
* Get the characteristic vector X(x, V)
* where V is substring(pos, end)
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java
index 85f8d58762df..223b25b730fd 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java
@@ -46,40 +46,43 @@ private MinimizationOperations() {}
/**
* Minimizes (and determinizes if not already deterministic) the given
* automaton.
- *
- * @see Automaton#setMinimization(int)
*/
- public static void minimize(Automaton a) {
- if (!a.isSingleton()) {
- minimizeHopcroft(a);
- }
- // recompute hash code
- //a.hash_code = 1a.getNumberOfStates() * 3 + a.getNumberOfTransitions() * 2;
- //if (a.hash_code == 0) a.hash_code = 1;
+ public static Automaton minimize(Automaton a) {
+ return minimizeHopcroft(a);
}
/**
* Minimizes the given automaton using Hopcroft's algorithm.
*/
- public static void minimizeHopcroft(Automaton a) {
- a.determinize();
- if (a.initial.numTransitions == 1) {
- Transition t = a.initial.transitionsArray[0];
- if (t.to == a.initial && t.min == Character.MIN_CODE_POINT
- && t.max == Character.MAX_CODE_POINT) return;
+ public static Automaton minimizeHopcroft(Automaton a) {
+ if (a.getNumStates() == 0 || (a.isAccept(0) == false && a.getNumTransitions(0) == 0)) {
+ // Fastmatch for common case
+ return new Automaton();
+ }
+ a = Operations.determinize(a);
+ //a.writeDot("adet");
+ if (a.getNumTransitions(0) == 1) {
+ Transition t = new Transition();
+ a.getTransition(0, 0, t);
+ if (t.dest == 0 && t.min == Character.MIN_CODE_POINT
+ && t.max == Character.MAX_CODE_POINT) {
+ // Accepts all strings
+ return a;
+ }
}
- a.totalize();
+ a = Operations.totalize(a);
+ //a.writeDot("atot");
// initialize data structures
final int[] sigma = a.getStartPoints();
- final State[] states = a.getNumberedStates();
- final int sigmaLen = sigma.length, statesLen = states.length;
- @SuppressWarnings({"rawtypes","unchecked"}) final ArrayList[][] reverse =
- (ArrayList[][]) new ArrayList[statesLen][sigmaLen];
- @SuppressWarnings({"rawtypes","unchecked"}) final HashSet[] partition =
- (HashSet[]) new HashSet[statesLen];
- @SuppressWarnings({"rawtypes","unchecked"}) final ArrayList[] splitblock =
- (ArrayList[]) new ArrayList[statesLen];
+ final int sigmaLen = sigma.length, statesLen = a.getNumStates();
+
+ @SuppressWarnings({"rawtypes","unchecked"}) final ArrayList[][] reverse =
+ (ArrayList[][]) new ArrayList[statesLen][sigmaLen];
+ @SuppressWarnings({"rawtypes","unchecked"}) final HashSet[] partition =
+ (HashSet[]) new HashSet[statesLen];
+ @SuppressWarnings({"rawtypes","unchecked"}) final ArrayList[] splitblock =
+ (ArrayList[]) new ArrayList[statesLen];
final int[] block = new int[statesLen];
final StateList[][] active = new StateList[statesLen][sigmaLen];
final StateListNode[][] active2 = new StateListNode[statesLen][sigmaLen];
@@ -96,71 +99,78 @@ public static void minimizeHopcroft(Automaton a) {
}
// find initial partition and reverse edges
for (int q = 0; q < statesLen; q++) {
- final State qq = states[q];
- final int j = qq.accept ? 0 : 1;
- partition[j].add(qq);
+ final int j = a.isAccept(q) ? 0 : 1;
+ partition[j].add(q);
block[q] = j;
for (int x = 0; x < sigmaLen; x++) {
- final ArrayList[] r =
- reverse[qq.step(sigma[x]).number];
- if (r[x] == null)
+ final ArrayList[] r = reverse[a.step(q, sigma[x])];
+ if (r[x] == null) {
r[x] = new ArrayList<>();
- r[x].add(qq);
+ }
+ r[x].add(q);
}
}
// initialize active sets
for (int j = 0; j <= 1; j++) {
for (int x = 0; x < sigmaLen; x++) {
- for (final State qq : partition[j]) {
- if (reverse[qq.number][x] != null)
- active2[qq.number][x] = active[j][x].add(qq);
+ for (int q : partition[j]) {
+ if (reverse[q][x] != null) {
+ active2[q][x] = active[j][x].add(q);
+ }
}
}
}
+
// initialize pending
for (int x = 0; x < sigmaLen; x++) {
final int j = (active[0][x].size <= active[1][x].size) ? 0 : 1;
pending.add(new IntPair(j, x));
pending2.set(x*statesLen + j);
}
+
// process pending until fixed point
int k = 2;
+ //System.out.println("start min");
while (!pending.isEmpty()) {
+ //System.out.println(" cycle pending");
final IntPair ip = pending.removeFirst();
final int p = ip.n1;
final int x = ip.n2;
+ //System.out.println(" pop n1=" + ip.n1 + " n2=" + ip.n2);
pending2.clear(x*statesLen + p);
// find states that need to be split off their blocks
for (StateListNode m = active[p][x].first; m != null; m = m.next) {
- final ArrayList r = reverse[m.q.number][x];
- if (r != null) for (final State s : r) {
- final int i = s.number;
- if (!split.get(i)) {
- split.set(i);
- final int j = block[i];
- splitblock[j].add(s);
- if (!refine2.get(j)) {
- refine2.set(j);
- refine.set(j);
+ final ArrayList r = reverse[m.q][x];
+ if (r != null) {
+ for (int i : r) {
+ if (!split.get(i)) {
+ split.set(i);
+ final int j = block[i];
+ splitblock[j].add(i);
+ if (!refine2.get(j)) {
+ refine2.set(j);
+ refine.set(j);
+ }
}
}
}
}
+
// refine blocks
for (int j = refine.nextSetBit(0); j >= 0; j = refine.nextSetBit(j+1)) {
- final ArrayList sb = splitblock[j];
+ final ArrayList sb = splitblock[j];
if (sb.size() < partition[j].size()) {
- final HashSet b1 = partition[j];
- final HashSet b2 = partition[k];
- for (final State s : sb) {
+ final HashSet b1 = partition[j];
+ final HashSet b2 = partition[k];
+ for (int s : sb) {
b1.remove(s);
b2.add(s);
- block[s.number] = k;
+ block[s] = k;
for (int c = 0; c < sigmaLen; c++) {
- final StateListNode sn = active2[s.number][c];
+ final StateListNode sn = active2[s][c];
if (sn != null && sn.sl == active[j][c]) {
sn.remove();
- active2[s.number][c] = active[k][c].add(s);
+ active2[s][c] = active[k][c].add(s);
}
}
}
@@ -180,33 +190,69 @@ public static void minimizeHopcroft(Automaton a) {
k++;
}
refine2.clear(j);
- for (final State s : sb)
- split.clear(s.number);
+ for (int s : sb) {
+ split.clear(s);
+ }
sb.clear();
}
refine.clear();
}
+
+ Automaton result = new Automaton();
+
+ Transition t = new Transition();
+
+ //System.out.println(" k=" + k);
+
// make a new state for each equivalence class, set initial state
- State[] newstates = new State[k];
- for (int n = 0; n < newstates.length; n++) {
- final State s = new State();
- newstates[n] = s;
- for (State q : partition[n]) {
- if (q == a.initial) a.initial = s;
- s.accept = q.accept;
- s.number = q.number; // select representative
- q.number = n;
+ int[] stateMap = new int[statesLen];
+ int[] stateRep = new int[k];
+
+ result.createState();
+
+ //System.out.println("min: k=" + k);
+ for (int n = 0; n < k; n++) {
+ //System.out.println(" n=" + n);
+
+ boolean isInitial = false;
+ for (int q : partition[n]) {
+ if (q == 0) {
+ isInitial = true;
+ //System.out.println(" isInitial!");
+ break;
+ }
+ }
+
+ int newState;
+ if (isInitial) {
+ newState = 0;
+ } else {
+ newState = result.createState();
+ }
+
+ //System.out.println(" newState=" + newState);
+
+ for (int q : partition[n]) {
+ stateMap[q] = newState;
+ //System.out.println(" q=" + q + " isAccept?=" + a.isAccept(q));
+ result.setAccept(newState, a.isAccept(q));
+ stateRep[newState] = q; // select representative
}
}
+
// build transitions and set acceptance
- for (int n = 0; n < newstates.length; n++) {
- final State s = newstates[n];
- s.accept = states[s.number].accept;
- for (Transition t : states[s.number].getTransitions())
- s.addTransition(new Transition(t.min, t.max, newstates[t.to.number]));
+ for (int n = 0; n < k; n++) {
+ int numTransitions = a.initTransition(stateRep[n], t);
+ for(int i=0;i
+ * Complexity: linear in total number of states.
+ */
+ static public Automaton concatenate(Automaton a1, Automaton a2) {
+ // Two-argument convenience form: wraps both automata in a list and
+ // delegates to the list-based concatenate overload.
+ return concatenate(Arrays.asList(a1, a2));
+ }
+
+ /**
+ * Returns an automaton that accepts the concatenation of the languages of the
+ * given automata.
+ *
+ * Complexity: linear in total number of states.
+ */
+ static public Automaton concatenate(List l) {
+ Automaton result = new Automaton();
+
+ // First pass: create all states
+ for(Automaton a : l) {
+ if (a.getNumStates() == 0) {
+ result.finishState();
+ return result;
+ }
+ int numStates = a.getNumStates();
+ for(int s=0;s
+ * Complexity: linear in number of states.
+ */
+ static public Automaton optional(Automaton a) {
+ Automaton result = new Automaton();
+ // State 0 is a fresh accepting initial state, so the empty string is accepted.
+ result.createState();
+ result.setAccept(0, true);
+ if (a.getNumStates() > 0) {
+ // The epsilon 0 -> 1 assumes copy(a) places a's initial state at index 1,
+ // right after the state created above -- TODO confirm against Automaton.copy.
+ result.copy(a);
+ result.addEpsilon(0, 1);
+ }
+ result.finishState();
+ return result;
+ }
+
+ /**
+ * Returns an automaton that accepts the Kleene star (zero or more
+ * concatenated repetitions) of the language of the given automaton. Never
+ * modifies the input automaton language.
+ *
+ * Complexity: linear in number of states.
+ */
+ static public Automaton repeat(Automaton a) {
+ Automaton.Builder builder = new Automaton.Builder();
+ builder.createState();
+ builder.setAccept(0, true);
+ builder.copy(a);
+
+ Transition t = new Transition();
+ int count = a.initTransition(0, t);
+ for(int i=0;imin or more concatenated
+ * repetitions of the language of the given automaton.
+ *
+ * Complexity: linear in number of states and in <code>min</code>.
+ */
+ static public Automaton repeat(Automaton a, int min) {
+ if (min == 0) {
+ // Zero required repetitions is exactly the Kleene star.
+ return repeat(a);
+ }
+ // Build the list a, a, ..., a (min entries, all the same instance) followed
+ // by repeat(a), then concatenate the whole list. A negative min adds no
+ // entries, so only repeat(a) is concatenated.
+ List as = new ArrayList<>();
+ while (min-- > 0) {
+ as.add(a);
+ }
+ as.add(repeat(a));
+ return concatenate(as);
+ }
+
+ /**
+ * Returns an automaton that accepts between <code>min</code> and
+ * <code>max</code> (including both) concatenated repetitions of the language
+ * of the given automaton.
+ *
+ * Complexity: linear in number of states and in <code>min</code> and
+ * <code>max</code>.
+ */
+ static public Automaton repeat(Automaton a, int min, int max) {
+ if (min > max) {
+ return Automata.makeEmpty();
+ }
+
+ Automaton b;
+ if (min == 0) {
+ b = Automata.makeEmptyString();
+ } else if (min == 1) {
+ b = new Automaton();
+ b.copy(a);
+ } else {
+ List as = new ArrayList<>();
+ for(int i=0;i prevAcceptStates = toSet(b, 0);
+
+ for(int i=min;i toSet(Automaton a, int offset) {
+ int numStates = a.getNumStates();
+ BitSet isAccept = a.getAcceptStates();
+ Set result = new HashSet();
+ int upto = 0;
+ while (upto < numStates && (upto = isAccept.nextSetBit(upto)) != -1) {
+ result.add(offset+upto);
+ upto++;
+ }
+
+ return result;
+ }
+
+ /**
+ * Returns a (deterministic) automaton that accepts the complement of the
+ * language of the given automaton.
+ *
+ * Complexity: linear in number of states (if already deterministic).
+ */
+ static public Automaton complement(Automaton a) {
+ a = totalize(determinize(a));
+ int numStates = a.getNumStates();
+ for (int p=0;pa1 and the complement of the language of
+ * <code>a2</code>. As a side-effect, the automata may be determinized, if not
+ * already deterministic.
+ *
+ * Complexity: quadratic in number of states (if already deterministic).
+ */
+ static public Automaton minus(Automaton a1, Automaton a2) {
+ // Nothing remains when a1 is empty or when both arguments are the same instance.
+ if (Operations.isEmpty(a1) || a1 == a2) {
+ return Automata.makeEmpty();
+ }
+ // Subtracting an empty language changes nothing; a1 itself is returned, not a copy.
+ if (Operations.isEmpty(a2)) {
+ return a1;
+ }
+ // General case: a1 minus a2 is a1 intersected with the complement of a2.
+ return intersection(a1, complement(a2));
+ }
+
+ /**
+ * Returns an automaton that accepts the intersection of the languages of the
+ * given automata. Never modifies the input automata languages.
+ *
+ * When both arguments are the same instance, or when either argument has no
+ * states, one of the inputs is returned as-is instead of a new automaton.
+ *
+ * Complexity: quadratic in number of states.
+ */
+ static public Automaton intersection(Automaton a1, Automaton a2) {
+ if (a1 == a2) {
+ return a1;
+ }
+ // An automaton with zero states is returned directly as the result.
+ if (a1.getNumStates() == 0) {
+ return a1;
+ }
+ if (a2.getNumStates() == 0) {
+ return a2;
+ }
+ Transition[][] transitions1 = a1.getSortedTransitions();
+ Transition[][] transitions2 = a2.getSortedTransitions();
+ Automaton c = new Automaton();
+ c.createState();
+ // Product construction: each StatePair ties a state of a1 (s1) and a state
+ // of a2 (s2) to the state of c (s) that stands for the pair.
+ LinkedList worklist = new LinkedList<>();
+ HashMap newstates = new HashMap<>();
+ // The pair of both initial states maps to state 0 of c.
+ StatePair p = new StatePair(0, 0, 0);
+ worklist.add(p);
+ newstates.put(p, p);
+ while (worklist.size() > 0) {
+ p = worklist.removeFirst();
+ // A product state accepts only when both component states accept.
+ c.setAccept(p.s, a1.isAccept(p.s1) && a2.isAccept(p.s2));
+ Transition[] t1 = transitions1[p.s1];
+ Transition[] t2 = transitions2[p.s2];
+ for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) {
+ // Skip transitions of a2 that end before the current a1 transition starts;
+ // b2 is kept across iterations of the outer loop.
+ while (b2 < t2.length && t2[b2].max < t1[n1].min)
+ b2++;
+ // Visit every a2 transition whose range overlaps the current a1 transition.
+ for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++)
+ if (t2[n2].max >= t1[n1].min) {
+ StatePair q = new StatePair(t1[n1].dest, t2[n2].dest);
+ StatePair r = newstates.get(q);
+ if (r == null) {
+ // First time this destination pair is seen: allocate its state in c.
+ q.s = c.createState();
+ worklist.add(q);
+ newstates.put(q, q);
+ r = q;
+ }
+ // The new transition covers the overlap of the two label ranges.
+ int min = t1[n1].min > t2[n2].min ? t1[n1].min : t2[n2].min;
+ int max = t1[n1].max < t2[n2].max ? t1[n1].max : t2[n2].max;
+ c.addTransition(p.s, r.s, min, max);
+ }
+ }
+ }
+ c.finishState();
+
+ return removeDeadStates(c);
+ }
+
+ /** Returns true if these two automata accept exactly the
+ * same language. This is a costly computation! Nothing is
+ * determinized here: unless a1 and a2 are the same instance,
+ * both are passed to {@link #subsetOf}, which throws
+ * {@code IllegalArgumentException} for a non-deterministic
+ * argument. Both automata must be determinized and have
+ * no dead states! */
+ public static boolean sameLanguage(Automaton a1, Automaton a2) {
+ if (a1 == a2) {
+ return true;
+ }
+ // Equal languages means containment in both directions.
+ return subsetOf(a2, a1) && subsetOf(a1, a2);
+ }
+
+ // TODO: move to test-framework?
+ /** Returns true if this automaton has any states that cannot
+ * be reached from the initial state or cannot reach an accept state.
+ * Cost is O(numTransitions+numStates). */
+ public static boolean hasDeadStates(Automaton a) {
+ // Any state missing from the live set counts as dead.
+ BitSet liveStates = getLiveStates(a);
+ int numLive = liveStates.cardinality();
+ int numStates = a.getNumStates();
+ // Sanity check: the live set can never be larger than the automaton.
+ assert numLive <= numStates: "numLive=" + numLive + " numStates=" + numStates + " " + liveStates;
+ return numLive < numStates;
+ }
+
+ // TODO: move to test-framework?
+ /** Returns true if there are dead states reachable from an initial state. */
+ public static boolean hasDeadStatesFromInitial(Automaton a) {
+ BitSet reachableFromInitial = getLiveStatesFromInitial(a);
+ BitSet reachableFromAccept = getLiveStatesToAccept(a);
+ // In-place set difference: keep only the states of the first set that are
+ // absent from the second one.
+ reachableFromInitial.andNot(reachableFromAccept);
+ return reachableFromInitial.isEmpty() == false;
+ }
+
+ // TODO: move to test-framework?
+ /** Returns true if there are dead states that reach an accept state. */
+ public static boolean hasDeadStatesToAccept(Automaton a) {
+ BitSet reachableFromInitial = getLiveStatesFromInitial(a);
+ BitSet reachableFromAccept = getLiveStatesToAccept(a);
+ // In-place set difference: keep only the states of the to-accept set that
+ // are absent from the from-initial set.
+ reachableFromAccept.andNot(reachableFromInitial);
+ return reachableFromAccept.isEmpty() == false;
+ }
+
+ /**
+ * Returns true if the language of <code>a1</code> is a subset of the language
+ * of <code>a2</code>. Both automata must be determinized and must have no dead
+ * states.
+ *
+ * Complexity: quadratic in number of states.
+ *
+ * @throws IllegalArgumentException if either automaton is not deterministic
+ */
+ public static boolean subsetOf(Automaton a1, Automaton a2) {
+ if (a1.isDeterministic() == false) {
+ throw new IllegalArgumentException("a1 must be deterministic");
+ }
+ if (a2.isDeterministic() == false) {
+ throw new IllegalArgumentException("a2 must be deterministic");
+ }
+ // The dead-state precondition is only checked when assertions are enabled.
+ assert hasDeadStatesFromInitial(a1) == false;
+ assert hasDeadStatesFromInitial(a2) == false;
+ if (a1.getNumStates() == 0) {
+ // Empty language is always a subset of any other language
+ return true;
+ } else if (a2.getNumStates() == 0) {
+ return isEmpty(a1);
+ }
+
+ // TODO: cutover to iterators instead
+ Transition[][] transitions1 = a1.getSortedTransitions();
+ Transition[][] transitions2 = a2.getSortedTransitions();
+ // Walk pairs of states (s1 from a1, s2 from a2) in lockstep, starting
+ // from both initial states; visited prevents re-queuing a pair.
+ LinkedList worklist = new LinkedList<>();
+ HashSet visited = new HashSet<>();
+ StatePair p = new StatePair(0, 0);
+ worklist.add(p);
+ visited.add(p);
+ while (worklist.size() > 0) {
+ p = worklist.removeFirst();
+ // a1 accepts here but a2 does not: a1 has a string that a2 lacks.
+ if (a1.isAccept(p.s1) && a2.isAccept(p.s2) == false) {
+ return false;
+ }
+ Transition[] t1 = transitions1[p.s1];
+ Transition[] t2 = transitions2[p.s2];
+ for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) {
+ // Skip a2 transitions that end before the current a1 transition starts.
+ while (b2 < t2.length && t2[b2].max < t1[n1].min) {
+ b2++;
+ }
+ // [min1, max1] is the part of the a1 range not yet covered by a2.
+ int min1 = t1[n1].min, max1 = t1[n1].max;
+
+ for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) {
+ // The next a2 range starts above min1: labels in between are uncovered.
+ if (t2[n2].min > min1) {
+ return false;
+ }
+ if (t2[n2].max < Character.MAX_CODE_POINT) {
+ min1 = t2[n2].max + 1;
+ } else {
+ // a2 covers up to the top code point; make the uncovered range
+ // empty (min1 > max1) without computing max + 1.
+ min1 = Character.MAX_CODE_POINT;
+ max1 = Character.MIN_CODE_POINT;
+ }
+ StatePair q = new StatePair(t1[n1].dest, t2[n2].dest);
+ if (!visited.contains(q)) {
+ worklist.add(q);
+ visited.add(q);
+ }
+ }
+ // Some tail of the a1 range was never covered by a2.
+ if (min1 <= max1) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Returns an automaton that accepts the union of the languages of the given
+ * automata.
+ *
+ * Complexity: linear in number of states.
+ */
+ public static Automaton union(Automaton a1, Automaton a2) {
+ // Two-argument convenience form: delegates to the collection-based overload.
+ return union(Arrays.asList(a1, a2));
+ }
+
+ /**
+  * Returns an automaton that accepts the union of the languages of the given
+  * automata.
+  *
+  * State 0 of the result is a new initial state; every input automaton is
+  * copied into the result and linked from state 0 by an epsilon transition.
+  * Inputs with zero states contribute nothing.
+  *
+  * Complexity: linear in number of states.
+  *
+  * @param l the automata to unite
+  * @return a new automaton for the union
+  */
+ public static Automaton union(Collection<Automaton> l) {
+   Automaton result = new Automaton();
+
+   // Create initial state:
+   result.createState();
+
+   // Copy over all automata
+   for (Automaton a : l) {
+     result.copy(a);
+   }
+
+   // Add epsilon transition from new initial state. stateOffset tracks where
+   // each copied automaton's initial state sits, assuming copy() appends the
+   // states of each input in order after state 0 -- TODO confirm against
+   // Automaton.copy.
+   int stateOffset = 1;
+   for (Automaton a : l) {
+     if (a.getNumStates() == 0) {
+       continue;
+     }
+     result.addEpsilon(0, stateOffset);
+     stateOffset += a.getNumStates();
+   }
+
+   result.finishState();
+
+   return result;
+ }
+
+ // Simple custom ArrayList: stores transitions flattened into one int[]
+ // as consecutive (dest, min, max) triples.
+ private final static class TransitionList {
+ // dest, min, max
+ int[] transitions = new int[3];
+ // Index of the next free slot; always a multiple of 3 as long as it is only
+ // advanced by add() or reset to 0.
+ int next;
+
+ // Appends the dest/min/max of t, growing the backing array when fewer than
+ // three slots remain.
+ public void add(Transition t) {
+ if (transitions.length < next+3) {
+ transitions = ArrayUtil.grow(transitions, next+3);
+ }
+ transitions[next] = t.dest;
+ transitions[next+1] = t.min;
+ transitions[next+2] = t.max;
+ next += 3;
+ }
+ }
+
+ // Holds all transitions that start on this int point, or
+ // end at this point-1. Ordering, equality and hashing depend only on
+ // the point value, never on the collected transitions.
+ private final static class PointTransitions implements Comparable<PointTransitions> {
+   int point;
+   final TransitionList ends = new TransitionList();
+   final TransitionList starts = new TransitionList();
+
+   // Orders by point. Integer.compare avoids the overflow that plain
+   // subtraction has for operands of opposite sign and large magnitude.
+   @Override
+   public int compareTo(PointTransitions other) {
+     return Integer.compare(point, other.point);
+   }
+
+   // Re-targets this instance at a new point and empties both lists; the
+   // backing arrays of the lists are kept for reuse.
+   public void reset(int point) {
+     this.point = point;
+     ends.next = 0;
+     starts.next = 0;
+   }
+
+   // Equal when the other object is a PointTransitions with the same point.
+   // null and foreign types yield false instead of an exception.
+   @Override
+   public boolean equals(Object other) {
+     return other instanceof PointTransitions && ((PointTransitions) other).point == point;
+   }
+
+   @Override
+   public int hashCode() {
+     return point;
+   }
+ }
+
+ private final static class PointTransitionSet {
+ int count;
+ PointTransitions[] points = new PointTransitions[5];
+
+ private final static int HASHMAP_CUTOVER = 30;
+ private final HashMap map = new HashMap<>();
+ private boolean useHash = false;
+
+ private PointTransitions next(int point) {
+ // 1st time we are seeing this point
+ if (count == points.length) {
+ final PointTransitions[] newArray = new PointTransitions[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+ System.arraycopy(points, 0, newArray, 0, count);
+ points = newArray;
+ }
+ PointTransitions points0 = points[count];
+ if (points0 == null) {
+ points0 = points[count] = new PointTransitions();
+ }
+ points0.reset(point);
+ count++;
+ return points0;
+ }
+
+ private PointTransitions find(int point) {
+ if (useHash) {
+ final Integer pi = point;
+ PointTransitions p = map.get(pi);
+ if (p == null) {
+ p = next(point);
+ map.put(pi, p);
+ }
+ return p;
+ } else {
+ for(int i=0;i 1) ArrayUtil.timSort(points, 0, count);
+ }
+
+ public void add(Transition t) {
+ find(t.min).starts.add(t);
+ find(1+t.max).ends.add(t);
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder s = new StringBuilder();
+ for(int i=0;i 0) {
+ s.append(' ');
+ }
+ s.append(points[i].point).append(':').append(points[i].starts.next/3).append(',').append(points[i].ends.next/3);
+ }
+ return s.toString();
+ }
+ }
+
+ /**
+ * Determinizes the given automaton.
+ *
+ * Worst case complexity: exponential in number of states.
+ */
+ public static Automaton determinize(Automaton a) {
+ if (a.isDeterministic()) {
+ // Already determinized
+ return a;
+ }
+ if (a.getNumStates() <= 1) {
+ // Already determinized
+ return a;
+ }
+
+ // subset construction
+ Automaton.Builder b = new Automaton.Builder();
+
+ //System.out.println("DET:");
+ //a.writeDot("/l/la/lucene/core/detin.dot");
+
+ SortedIntSet.FrozenIntSet initialset = new SortedIntSet.FrozenIntSet(0, 0);
+
+ // Create state 0:
+ b.createState();
+
+ LinkedList worklist = new LinkedList<>();
+ Map newstate = new HashMap<>();
+
+ worklist.add(initialset);
+
+ b.setAccept(0, a.isAccept(0));
+ newstate.put(initialset, 0);
+
+ int newStateUpto = 0;
+ int[] newStatesArray = new int[5];
+ newStatesArray[newStateUpto] = 0;
+ newStateUpto++;
+
+ // like Set
+ final PointTransitionSet points = new PointTransitionSet();
+
+ // like SortedMap
+ final SortedIntSet statesSet = new SortedIntSet(5);
+
+ Transition t = new Transition();
+
+ while (worklist.size() > 0) {
+ SortedIntSet.FrozenIntSet s = worklist.removeFirst();
+ //System.out.println("det: pop set=" + s);
+
+ // Collate all outgoing transitions by min/1+max:
+ for(int i=0;i 0) {
+ assert lastPoint != -1;
+
+ statesSet.computeHash();
+
+ Integer q = newstate.get(statesSet);
+ if (q == null) {
+ q = b.createState();
+ final SortedIntSet.FrozenIntSet p = statesSet.freeze(q);
+ //System.out.println(" make new state=" + q + " -> " + p + " accCount=" + accCount);
+ worklist.add(p);
+ b.setAccept(q, accCount > 0);
+ newstate.put(p, q);
+ } else {
+ assert (accCount > 0 ? true:false) == b.isAccept(q): "accCount=" + accCount + " vs existing accept=" +
+ b.isAccept(q) + " states=" + statesSet;
+ }
+
+ // System.out.println(" add trans src=" + r + " dest=" + q + " min=" + lastPoint + " max=" + (point-1));
+
+ b.addTransition(r, q, lastPoint, point-1);
+ }
+
+ // process transitions that end on this point
+ // (closes an overlapping interval)
+ int[] transitions = points.points[i].ends.transitions;
+ int limit = points.points[i].ends.next;
+ for(int j=0;j workList = new LinkedList<>();
+ BitSet seen = new BitSet(a.getNumStates());
+ workList.add(0);
+ seen.set(0);
+
+ Transition t = new Transition();
+ while (workList.isEmpty() == false) {
+ int state = workList.removeFirst();
+ if (a.isAccept(state)) {
+ return false;
+ }
+ int count = a.initTransition(state, t);
+ for(int i=0;i
+ * Complexity: linear in the length of the string.
+ *
+ * Note: for full performance, use the {@link RunAutomaton} class.
+ */
+ public static boolean run(Automaton a, String s) {
+ // Determinism is only checked when assertions are enabled.
+ assert a.isDeterministic();
+ int state = 0;
+ // Walk the string one code point at a time. cp is assigned inside the step
+ // call below, before the loop increment reads it, so i advances by the
+ // number of chars the code point just consumed.
+ for (int i = 0, cp = 0; i < s.length(); i += Character.charCount(cp)) {
+ int nextState = a.step(state, cp = s.codePointAt(i));
+ if (nextState == -1) {
+ // -1 from step is treated as "no transition for this code point": reject.
+ return false;
+ }
+ state = nextState;
+ }
+ return a.isAccept(state);
+ }
+
+ /**
+ * Returns true if the given string (expressed as unicode codepoints) is accepted by the automaton. The input must be deterministic.
+ *
+ * Complexity: linear in the length of the string.
+ *
+ * Note: for full performance, use the {@link RunAutomaton} class.
+ */
+ public static boolean run(Automaton a, IntsRef s) {
+ assert a.isDeterministic();
+ int state = 0;
+ for (int i=0;i workList = new LinkedList<>();
+ live.set(0);
+ workList.add(0);
+
+ Transition t = new Transition();
+ while (workList.isEmpty() == false) {
+ int s = workList.removeFirst();
+ int count = a.initTransition(s, t);
+ for(int i=0;i workList = new LinkedList<>();
+ BitSet live = new BitSet(numStates);
+ BitSet acceptBits = a.getAcceptStates();
+ int s = 0;
+ while (s < numStates && (s = acceptBits.nextSetBit(s)) != -1) {
+ live.set(s);
+ workList.add(s);
+ s++;
+ }
+
+ while (workList.isEmpty() == false) {
+ s = workList.removeFirst();
+ int count = a2.initTransition(s, t);
+ for(int i=0;i visited = new HashSet<>();
+ int s = 0;
+ boolean done;
+ Transition t = new Transition();
+ do {
+ done = true;
+ visited.add(s);
+ if (a.isAccept(s) == false && a.getNumTransitions(s) == 1) {
+ a.getTransition(s, 0, t);
+ if (t.min == t.max && !visited.contains(t.dest)) {
+ b.appendCodePoint(t.min);
+ s = t.dest;
+ done = false;
+ }
+ }
+ } while (!done);
+
+ return b.toString();
+ }
+
+ // TODO: this currently requires a determinized machine,
+ // but it need not -- we can speed it up by walking the
+ // NFA instead. it'd still be fail fast.
+ /**
+ * Returns the longest BytesRef that is a prefix of all accepted strings and
+ * visits each state at most once. The automaton must be deterministic.
+ *
+ * @return common prefix
+ */
+ public static BytesRef getCommonPrefixBytesRef(Automaton a) {
+ BytesRef ref = new BytesRef(10);
+ HashSet visited = new HashSet<>();
+ int s = 0;
+ boolean done;
+ Transition t = new Transition();
+ do {
+ done = true;
+ visited.add(s);
+ // The prefix can only be extended from a non-accepting state that has
+ // exactly one outgoing transition.
+ if (a.isAccept(s) == false && a.getNumTransitions(s) == 1) {
+ a.getTransition(s, 0, t);
+ // That transition must carry a single label and lead to a state not
+ // seen before, otherwise the walk stops.
+ if (t.min == t.max && !visited.contains(t.dest)) {
+ // Grow by one and store the label narrowed to a byte; this presumes
+ // labels are byte values -- verify against callers.
+ ref.grow(++ref.length);
+ ref.bytes[ref.length - 1] = (byte) t.min;
+ s = t.dest;
+ done = false;
+ }
+ }
+ } while (!done);
+
+ return ref;
+ }
+
+ /**
+ * Returns the longest BytesRef that is a suffix of all accepted strings.
+ * Worst case complexity: exponential in number of states (this calls
+ * determinize).
+ *
+ * @return common suffix
+ */
+ public static BytesRef getCommonSuffixBytesRef(Automaton a) {
+ // reverse the language of the automaton, then reverse its common prefix.
+ Automaton r = Operations.determinize(reverse(a));
+ BytesRef ref = getCommonPrefixBytesRef(r);
+ // The prefix of the reversed language is the suffix spelled backwards;
+ // flip the bytes in place to restore reading order.
+ reverseBytes(ref);
+ return ref;
+ }
+
+ // Reverses, in place, the ref.length bytes of ref.bytes that start at ref.offset.
+ private static void reverseBytes(BytesRef ref) {
+ // Zero or one byte is already its own reverse.
+ if (ref.length <= 1) return;
+ // Only the first half is walked; each step swaps with the mirrored index.
+ int num = ref.length >> 1;
+ for (int i = ref.offset; i < ( ref.offset + num ); i++) {
+ byte b = ref.bytes[i];
+ // Mirror of i within [offset, offset+length): offset + (length - 1) - (i - offset).
+ ref.bytes[i] = ref.bytes[ref.offset * 2 + ref.length - i - 1];
+ ref.bytes[ref.offset * 2 + ref.length - i - 1] = b;
+ }
+ }
+
+ /** Returns an automaton accepting the reverse language. */
+ public static Automaton reverse(Automaton a) {
+ // Delegates to the two-argument overload, passing null for its
+ // initialStates parameter.
+ return reverse(a, null);
+ }
+
+ /** Reverses the automaton, returning the new initial states. */
+ static Automaton reverse(Automaton a, Set initialStates) {
+
+ if (Operations.isEmpty(a)) {
+ return new Automaton();
+ }
+
+ int numStates = a.getNumStates();
+
+ // Build a new automaton with all edges reversed
+ Automaton.Builder builder = new Automaton.Builder();
+
+ // Initial node; we'll add epsilon transitions in the end:
+ builder.createState();
+
+ for(int s=0;s t.max) {
+ // We've exhausted the current transition's labels;
+ // move to next transitions:
+ transition++;
+ if (transition >= a.getNumTransitions(state)) {
+ // We're done iterating transitions leaving this state
+ return -1;
+ }
+ a.getTransition(state, transition, t);
+ label = t.min;
+ to = t.dest;
+ }
+ return label++;
+ }
+ }
+
+ // Returns nodes[index], lazily creating the PathNode on first use so that
+ // slots can be reused without reallocating.
+ private static PathNode getNode(PathNode[] nodes, int index) {
+ // The caller is expected to have grown the array already.
+ assert index < nodes.length;
+ if (nodes[index] == null) {
+ nodes[index] = new PathNode();
+ }
+ return nodes[index];
+ }
+
+ // TODO: this is a dangerous method ... Automaton could be
+ // huge ... and it's better in general for caller to
+ // enumerate & process in a single walk:
+
+ /** Returns the set of accepted strings, up to at most
+ * <code>limit</code> strings. If more than <code>limit</code>
+ * strings are accepted, the first limit strings found are returned. If <code>limit</code> == -1, then
+ * the limit is infinite. If the {@link Automaton} has
+ * cycles then this method might throw {@code
+ * IllegalArgumentException} but that is not guaranteed
+ * when the limit is set.
+ *
+ * @throws IllegalArgumentException if limit is neither -1 nor positive, or
+ *         if a state is reached again while it is still on the current path */
+ public static Set getFiniteStrings(Automaton a, int limit) {
+ Set results = new HashSet<>();
+
+ if (limit == -1 || limit > 0) {
+ // OK
+ } else {
+ throw new IllegalArgumentException("limit must be -1 (which means no limit), or > 0; got: " + limit);
+ }
+
+ if (a.isAccept(0)) {
+ // Special case the empty string, as usual:
+ results.add(new IntsRef());
+ }
+
+ // Only walk when state 0 has outgoing transitions and the empty string
+ // has not already filled the limit.
+ if (a.getNumTransitions(0) > 0 && (limit == -1 || results.size() < limit)) {
+
+ int numStates = a.getNumStates();
+
+ // Tracks which states are in the current path, for
+ // cycle detection:
+ BitSet pathStates = new BitSet(numStates);
+
+ // Stack to hold our current state in the
+ // recursion/iteration:
+ PathNode[] nodes = new PathNode[4];
+
+ pathStates.set(0);
+ PathNode root = getNode(nodes, 0);
+ root.resetState(a, 0);
+
+ // string holds the labels along the current path; its length is also the
+ // depth of the node stack.
+ IntsRef string = new IntsRef(1);
+ string.length = 1;
+
+ while (string.length > 0) {
+
+ PathNode node = nodes[string.length-1];
+
+ // Get next label leaving the current node:
+ int label = node.nextLabel(a);
+
+ if (label != -1) {
+ string.ints[string.length-1] = label;
+
+ if (a.isAccept(node.to)) {
+ // This transition leads to an accept state,
+ // so we save the current string:
+ results.add(IntsRef.deepCopyOf(string));
+ // With limit == -1 this comparison never matches, so no early exit.
+ if (results.size() == limit) {
+ break;
+ }
+ }
+
+ if (a.getNumTransitions(node.to) != 0) {
+ // Now recurse: the destination of this transition has
+ // outgoing transitions:
+ if (pathStates.get(node.to)) {
+ throw new IllegalArgumentException("automaton has cycles");
+ }
+ pathStates.set(node.to);
+
+ // Push node onto stack:
+ if (nodes.length == string.length) {
+ PathNode[] newNodes = new PathNode[ArrayUtil.oversize(nodes.length+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+ System.arraycopy(nodes, 0, newNodes, 0, nodes.length);
+ nodes = newNodes;
+ }
+ getNode(nodes, string.length).resetState(a, node.to);
+ string.length++;
+ string.grow(string.length);
+ }
+ } else {
+ // No more transitions leaving this state,
+ // pop/return back to previous state:
+ assert pathStates.get(node.state);
+ pathStates.clear(node.state);
+ string.length--;
+ }
+ }
+ }
+
+ return results;
+ }
+
+ /** Returns a new automaton accepting the same language with added
+ * transitions to a dead state so that from every state and every label
+ * there is a transition. */
+ static Automaton totalize(Automaton a) {
+ Automaton result = new Automaton();
+ int numStates = a.getNumStates();
+ for(int i=0;i maxi) {
+ result.addTransition(i, deadState, maxi, t.min-1);
+ }
+ if (t.max + 1 > maxi) {
+ maxi = t.max + 1;
+ }
+ }
+
+ if (maxi <= Character.MAX_CODE_POINT) {
+ result.addTransition(i, deadState, maxi, Character.MAX_CODE_POINT);
+ }
+ }
+
+ result.finishState();
+ return result;
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
index bf5b4be42064..76a040a620ae 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
@@ -361,8 +361,6 @@ enum Kind {
*/
public static final int NONE = 0x0000;
- private static boolean allow_mutation = false;
-
Kind kind;
RegExp exp1, exp2;
String s;
@@ -419,13 +417,13 @@ public RegExp(String s, int syntax_flags) throws IllegalArgumentException {
to = e.to;
b = null;
}
-
+
/**
* Constructs new <code>Automaton</code> from this <code>RegExp</code>. Same
* as <code>toAutomaton(null)</code> (empty automaton map).
*/
public Automaton toAutomaton() {
- return toAutomatonAllowMutate(null, null);
+ return toAutomaton(null, null);
}
/**
@@ -439,7 +437,7 @@ public Automaton toAutomaton() {
*/
public Automaton toAutomaton(AutomatonProvider automaton_provider)
throws IllegalArgumentException {
- return toAutomatonAllowMutate(null, automaton_provider);
+ return toAutomaton(null, automaton_provider);
}
/**
@@ -454,32 +452,9 @@ public Automaton toAutomaton(AutomatonProvider automaton_provider)
*/
public Automaton toAutomaton(Map automata)
throws IllegalArgumentException {
- return toAutomatonAllowMutate(automata, null);
- }
-
- /**
- * Sets or resets allow mutate flag. If this flag is set, then automata
- * construction uses mutable automata, which is slightly faster but not thread
- * safe. By default, the flag is not set.
- *
- * @param flag if true, the flag is set
- * @return previous value of the flag
- */
- public boolean setAllowMutate(boolean flag) {
- boolean b = allow_mutation;
- allow_mutation = flag;
- return b;
- }
-
- private Automaton toAutomatonAllowMutate(Map automata,
- AutomatonProvider automaton_provider) throws IllegalArgumentException {
- boolean b = false;
- if (allow_mutation) b = Automaton.setAllowMutate(true); // thread unsafe
- Automaton a = toAutomaton(automata, automaton_provider);
- if (allow_mutation) Automaton.setAllowMutate(b);
- return a;
+ return toAutomaton(automata, null);
}
-
+
private Automaton toAutomaton(Map automata,
AutomatonProvider automaton_provider) throws IllegalArgumentException {
List list;
@@ -489,8 +464,8 @@ private Automaton toAutomaton(Map automata,
list = new ArrayList<>();
findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider);
findLeaves(exp2, Kind.REGEXP_UNION, list, automata, automaton_provider);
- a = BasicOperations.union(list);
- MinimizationOperations.minimize(a);
+ a = Operations.union(list);
+ a = MinimizationOperations.minimize(a);
break;
case REGEXP_CONCATENATION:
list = new ArrayList<>();
@@ -498,66 +473,72 @@ private Automaton toAutomaton(Map automata,
automaton_provider);
findLeaves(exp2, Kind.REGEXP_CONCATENATION, list, automata,
automaton_provider);
- a = BasicOperations.concatenate(list);
- MinimizationOperations.minimize(a);
+ a = Operations.concatenate(list);
+ a = MinimizationOperations.minimize(a);
break;
case REGEXP_INTERSECTION:
- a = exp1.toAutomaton(automata, automaton_provider).intersection(
+ a = Operations.intersection(
+ exp1.toAutomaton(automata, automaton_provider),
exp2.toAutomaton(automata, automaton_provider));
- MinimizationOperations.minimize(a);
+ a = MinimizationOperations.minimize(a);
break;
case REGEXP_OPTIONAL:
- a = exp1.toAutomaton(automata, automaton_provider).optional();
- MinimizationOperations.minimize(a);
+ a = Operations.optional(exp1.toAutomaton(automata, automaton_provider));
+ a = MinimizationOperations.minimize(a);
break;
case REGEXP_REPEAT:
- a = exp1.toAutomaton(automata, automaton_provider).repeat();
- MinimizationOperations.minimize(a);
+ a = Operations.repeat(exp1.toAutomaton(automata, automaton_provider));
+ a = MinimizationOperations.minimize(a);
break;
case REGEXP_REPEAT_MIN:
- a = exp1.toAutomaton(automata, automaton_provider).repeat(min);
- MinimizationOperations.minimize(a);
+ a = Operations.repeat(exp1.toAutomaton(automata, automaton_provider), min);
+ a = MinimizationOperations.minimize(a);
break;
case REGEXP_REPEAT_MINMAX:
- a = exp1.toAutomaton(automata, automaton_provider).repeat(min, max);
- MinimizationOperations.minimize(a);
+ a = Operations.repeat(exp1.toAutomaton(automata, automaton_provider), min, max);
+ a = MinimizationOperations.minimize(a);
break;
case REGEXP_COMPLEMENT:
- a = exp1.toAutomaton(automata, automaton_provider).complement();
- MinimizationOperations.minimize(a);
+ a = Operations.complement(exp1.toAutomaton(automata, automaton_provider));
+ a = MinimizationOperations.minimize(a);
break;
case REGEXP_CHAR:
- a = BasicAutomata.makeChar(c);
+ a = Automata.makeChar(c);
break;
case REGEXP_CHAR_RANGE:
- a = BasicAutomata.makeCharRange(from, to);
+ a = Automata.makeCharRange(from, to);
break;
case REGEXP_ANYCHAR:
- a = BasicAutomata.makeAnyChar();
+ a = Automata.makeAnyChar();
break;
case REGEXP_EMPTY:
- a = BasicAutomata.makeEmpty();
+ a = Automata.makeEmpty();
break;
case REGEXP_STRING:
- a = BasicAutomata.makeString(s);
+ a = Automata.makeString(s);
break;
case REGEXP_ANYSTRING:
- a = BasicAutomata.makeAnyString();
+ a = Automata.makeAnyString();
break;
case REGEXP_AUTOMATON:
Automaton aa = null;
- if (automata != null) aa = automata.get(s);
- if (aa == null && automaton_provider != null) try {
- aa = automaton_provider.getAutomaton(s);
- } catch (IOException e) {
- throw new IllegalArgumentException(e);
+ if (automata != null) {
+ aa = automata.get(s);
}
- if (aa == null) throw new IllegalArgumentException("'" + s
- + "' not found");
- a = aa.clone(); // always clone here (ignore allow_mutate)
+ if (aa == null && automaton_provider != null) {
+ try {
+ aa = automaton_provider.getAutomaton(s);
+ } catch (IOException e) {
+ throw new IllegalArgumentException(e);
+ }
+ }
+ if (aa == null) {
+ throw new IllegalArgumentException("'" + s + "' not found");
+ }
+ a = aa;
break;
case REGEXP_INTERVAL:
- a = BasicAutomata.makeInterval(min, max, digits);
+ a = Automata.makeInterval(min, max, digits);
break;
}
return a;
@@ -568,7 +549,9 @@ private void findLeaves(RegExp exp, Kind kind, List list,
if (exp.kind == kind) {
findLeaves(exp.exp1, kind, list, automata, automaton_provider);
findLeaves(exp.exp2, kind, list, automata, automaton_provider);
- } else list.add(exp.toAutomaton(automata, automaton_provider));
+ } else {
+ list.add(exp.toAutomaton(automata, automaton_provider));
+ }
}
/**
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java
index bbcadd3374a3..7c216321d4f3 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java
@@ -37,6 +37,7 @@
* @lucene.experimental
*/
public abstract class RunAutomaton {
+ final Automaton automaton;
final int maxInterval;
final int size;
final boolean[] accept;
@@ -65,10 +66,10 @@ public String toString() {
if (j + 1 < points.length) max = (points[j + 1] - 1);
else max = maxInterval;
b.append(" ");
- Transition.appendCharString(min, b);
+ Automaton.appendCharString(min, b);
if (min != max) {
b.append("-");
- Transition.appendCharString(max, b);
+ Automaton.appendCharString(max, b);
}
b.append(" -> ").append(k).append("\n");
}
@@ -110,7 +111,7 @@ public final int[] getCharIntervals() {
* Gets character class of given codepoint
*/
final int getCharClass(int c) {
- return SpecialOperations.findIndex(c, points);
+ return Operations.findIndex(c, points);
}
/**
@@ -121,23 +122,23 @@ final int getCharClass(int c) {
*/
public RunAutomaton(Automaton a, int maxInterval, boolean tableize) {
this.maxInterval = maxInterval;
- a.determinize();
+ a = Operations.determinize(a);
+ this.automaton = a;
points = a.getStartPoints();
- final State[] states = a.getNumberedStates();
- initial = a.initial.number;
- size = states.length;
+ initial = 0;
+ size = Math.max(1,a.getNumStates());
accept = new boolean[size];
transitions = new int[size * points.length];
- for (int n = 0; n < size * points.length; n++)
- transitions[n] = -1;
- for (State s : states) {
- int n = s.number;
- accept[n] = s.accept;
+ Arrays.fill(transitions, -1);
+ for (int n=0;n 1) {
- int d = (a + b) >>> 1;
- if (points[d] > c) b = d;
- else if (points[d] < c) a = d;
- else return d;
- }
- return a;
- }
-
- /**
- * Returns true if the language of this automaton is finite.
- */
- public static boolean isFinite(Automaton a) {
- if (a.isSingleton()) return true;
- return isFinite(a.initial, new BitSet(a.getNumberOfStates()), new BitSet(a.getNumberOfStates()));
- }
-
- /**
- * Checks whether there is a loop containing s. (This is sufficient since
- * there are never transitions to dead states.)
- */
- // TODO: not great that this is recursive... in theory a
- // large automata could exceed java's stack
- private static boolean isFinite(State s, BitSet path, BitSet visited) {
- path.set(s.number);
- for (Transition t : s.getTransitions())
- if (path.get(t.to.number) || (!visited.get(t.to.number) && !isFinite(t.to, path, visited))) return false;
- path.clear(s.number);
- visited.set(s.number);
- return true;
- }
-
- /**
- * Returns the longest string that is a prefix of all accepted strings and
- * visits each state at most once.
- *
- * @return common prefix
- */
- public static String getCommonPrefix(Automaton a) {
- if (a.isSingleton()) return a.singleton;
- StringBuilder b = new StringBuilder();
- HashSet visited = new HashSet<>();
- State s = a.initial;
- boolean done;
- do {
- done = true;
- visited.add(s);
- if (!s.accept && s.numTransitions() == 1) {
- Transition t = s.getTransitions().iterator().next();
- if (t.min == t.max && !visited.contains(t.to)) {
- b.appendCodePoint(t.min);
- s = t.to;
- done = false;
- }
- }
- } while (!done);
- return b.toString();
- }
-
- // TODO: this currently requites a determinized machine,
- // but it need not -- we can speed it up by walking the
- // NFA instead. it'd still be fail fast.
- public static BytesRef getCommonPrefixBytesRef(Automaton a) {
- if (a.isSingleton()) return new BytesRef(a.singleton);
- BytesRef ref = new BytesRef(10);
- HashSet visited = new HashSet<>();
- State s = a.initial;
- boolean done;
- do {
- done = true;
- visited.add(s);
- if (!s.accept && s.numTransitions() == 1) {
- Transition t = s.getTransitions().iterator().next();
- if (t.min == t.max && !visited.contains(t.to)) {
- ref.grow(++ref.length);
- ref.bytes[ref.length - 1] = (byte)t.min;
- s = t.to;
- done = false;
- }
- }
- } while (!done);
- return ref;
- }
-
- /**
- * Returns the longest string that is a suffix of all accepted strings and
- * visits each state at most once.
- *
- * @return common suffix
- */
- public static String getCommonSuffix(Automaton a) {
- if (a.isSingleton()) // if singleton, the suffix is the string itself.
- return a.singleton;
-
- // reverse the language of the automaton, then reverse its common prefix.
- Automaton r = a.clone();
- reverse(r);
- r.determinize();
- return new StringBuilder(SpecialOperations.getCommonPrefix(r)).reverse().toString();
- }
-
- public static BytesRef getCommonSuffixBytesRef(Automaton a) {
- if (a.isSingleton()) // if singleton, the suffix is the string itself.
- return new BytesRef(a.singleton);
-
- // reverse the language of the automaton, then reverse its common prefix.
- Automaton r = a.clone();
- reverse(r);
- r.determinize();
- BytesRef ref = SpecialOperations.getCommonPrefixBytesRef(r);
- reverseBytes(ref);
- return ref;
- }
-
- private static void reverseBytes(BytesRef ref) {
- if (ref.length <= 1) return;
- int num = ref.length >> 1;
- for (int i = ref.offset; i < ( ref.offset + num ); i++) {
- byte b = ref.bytes[i];
- ref.bytes[i] = ref.bytes[ref.offset * 2 + ref.length - i - 1];
- ref.bytes[ref.offset * 2 + ref.length - i - 1] = b;
- }
- }
-
- /**
- * Reverses the language of the given (non-singleton) automaton while returning
- * the set of new initial states.
- */
- public static Set reverse(Automaton a) {
- a.expandSingleton();
- // reverse all edges
- HashMap> m = new HashMap<>();
- State[] states = a.getNumberedStates();
- Set accept = new HashSet<>();
- for (State s : states)
- if (s.isAccept())
- accept.add(s);
- for (State r : states) {
- m.put(r, new HashSet());
- r.accept = false;
- }
- for (State r : states)
- for (Transition t : r.getTransitions())
- m.get(t.to).add(new Transition(t.min, t.max, r));
- for (State r : states) {
- Set tr = m.get(r);
- r.setTransitions(tr.toArray(new Transition[tr.size()]));
- }
- // make new initial+final states
- a.initial.accept = true;
- a.initial = new State();
- for (State r : accept)
- a.initial.addEpsilon(r); // ensures that all initial states are reachable
- a.deterministic = false;
- a.clearNumberedStates();
- return accept;
- }
-
- private static class PathNode {
-
- /** Which state the path node ends on, whose
- * transitions we are enumerating. */
- public State state;
-
- /** Which state the current transition leads to. */
- public State to;
-
- /** Which transition we are on. */
- public int transition;
-
- /** Which label we are on, in the min-max range of the
- * current Transition */
- public int label;
-
- public void resetState(State state) {
- assert state.numTransitions() != 0;
- this.state = state;
- transition = 0;
- Transition t = state.transitionsArray[transition];
- label = t.min;
- to = t.to;
- }
-
- /** Returns next label of current transition, or
- * advances to next transition and returns its first
- * label, if current one is exhausted. If there are
- * no more transitions, returns -1. */
- public int nextLabel() {
- if (label > state.transitionsArray[transition].max) {
- // We've exhaused the current transition's labels;
- // move to next transitions:
- transition++;
- if (transition >= state.numTransitions()) {
- // We're done iterating transitions leaving this state
- return -1;
- }
- Transition t = state.transitionsArray[transition];
- label = t.min;
- to = t.to;
- }
- return label++;
- }
- }
-
- private static PathNode getNode(PathNode[] nodes, int index) {
- assert index < nodes.length;
- if (nodes[index] == null) {
- nodes[index] = new PathNode();
- }
- return nodes[index];
- }
-
- // TODO: this is a dangerous method ... Automaton could be
- // huge ... and it's better in general for caller to
- // enumerate & process in a single walk:
-
- /** Returns the set of accepted strings, up to at most
- * limit
strings. If more than limit
- * strings are accepted, the first limit strings found are returned. If limit
== -1, then
- * the limit is infinite. If the {@link Automaton} has
- * cycles then this method might throw {@code
- * IllegalArgumentException} but that is not guaranteed
- * when the limit is set. */
- public static Set getFiniteStrings(Automaton a, int limit) {
- Set results = new HashSet<>();
-
- if (limit == -1 || limit > 0) {
- // OK
- } else {
- throw new IllegalArgumentException("limit must be -1 (which means no limit), or > 0; got: " + limit);
- }
-
- if (a.isSingleton()) {
- // Easy case: automaton accepts only 1 string
- results.add(Util.toUTF32(a.singleton, new IntsRef()));
- } else {
-
- if (a.initial.accept) {
- // Special case the empty string, as usual:
- results.add(new IntsRef());
- }
-
- if (a.initial.numTransitions() > 0 && (limit == -1 || results.size() < limit)) {
-
- // TODO: we could use state numbers here and just
- // alloc array, but asking for states array can be
- // costly (it's lazily computed):
-
- // Tracks which states are in the current path, for
- // cycle detection:
- Set pathStates = Collections.newSetFromMap(new IdentityHashMap());
-
- // Stack to hold our current state in the
- // recursion/iteration:
- PathNode[] nodes = new PathNode[4];
-
- pathStates.add(a.initial);
- PathNode root = getNode(nodes, 0);
- root.resetState(a.initial);
-
- IntsRef string = new IntsRef(1);
- string.length = 1;
-
- while (string.length > 0) {
-
- PathNode node = nodes[string.length-1];
-
- // Get next label leaving the current node:
- int label = node.nextLabel();
-
- if (label != -1) {
- string.ints[string.length-1] = label;
-
- if (node.to.accept) {
- // This transition leads to an accept state,
- // so we save the current string:
- results.add(IntsRef.deepCopyOf(string));
- if (results.size() == limit) {
- break;
- }
- }
-
- if (node.to.numTransitions() != 0) {
- // Now recurse: the destination of this transition has
- // outgoing transitions:
- if (pathStates.contains(node.to)) {
- throw new IllegalArgumentException("automaton has cycles");
- }
- pathStates.add(node.to);
-
- // Push node onto stack:
- if (nodes.length == string.length) {
- PathNode[] newNodes = new PathNode[ArrayUtil.oversize(nodes.length+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
- System.arraycopy(nodes, 0, newNodes, 0, nodes.length);
- nodes = newNodes;
- }
- getNode(nodes, string.length).resetState(node.to);
- string.length++;
- string.grow(string.length);
- }
- } else {
- // No more transitions leaving this state,
- // pop/return back to previous state:
- assert pathStates.contains(node.state);
- pathStates.remove(node.state);
- string.length--;
- }
- }
- }
- }
-
- return results;
- }
-}
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/State.java b/lucene/core/src/java/org/apache/lucene/util/automaton/State.java
deleted file mode 100644
index d1639e435778..000000000000
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/State.java
+++ /dev/null
@@ -1,280 +0,0 @@
-/*
- * dk.brics.automaton
- *
- * Copyright (c) 2001-2009 Anders Moeller
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-package org.apache.lucene.util.automaton;
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.RamUsageEstimator;
-
-import java.util.Collection;
-import java.util.Comparator;
-import java.util.Iterator;
-
-/**
- * Automaton state.
- *
- * @lucene.experimental
- */
-public class State implements Comparable {
-
- boolean accept;
- public Transition[] transitionsArray;
- public int numTransitions;
-
- int number;
-
- int id;
- static int next_id;
-
- /**
- * Constructs a new state. Initially, the new state is a reject state.
- */
- public State() {
- resetTransitions();
- id = next_id++;
- }
-
- /**
- * Resets transition set.
- */
- final void resetTransitions() {
- transitionsArray = new Transition[0];
- numTransitions = 0;
- }
-
- private class TransitionsIterable implements Iterable {
- @Override
- public Iterator iterator() {
- return new Iterator() {
- int upto;
- @Override
- public boolean hasNext() {
- return upto < numTransitions;
- }
- @Override
- public Transition next() {
- return transitionsArray[upto++];
- }
- @Override
- public void remove() {
- throw new UnsupportedOperationException();
- }
- };
- }
- }
-
- /**
- * Returns the set of outgoing transitions. Subsequent changes are reflected
- * in the automaton.
- *
- * @return transition set
- */
- public Iterable getTransitions() {
- return new TransitionsIterable();
- }
-
- public int numTransitions() {
- return numTransitions;
- }
-
- public void setTransitions(Transition[] transitions) {
- this.numTransitions = transitions.length;
- this.transitionsArray = transitions;
- }
-
- /**
- * Adds an outgoing transition.
- *
- * @param t transition
- */
- public void addTransition(Transition t) {
- if (numTransitions == transitionsArray.length) {
- final Transition[] newArray = new Transition[ArrayUtil.oversize(1+numTransitions, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
- System.arraycopy(transitionsArray, 0, newArray, 0, numTransitions);
- transitionsArray = newArray;
- }
- transitionsArray[numTransitions++] = t;
- }
-
- /**
- * Sets acceptance for this state.
- *
- * @param accept if true, this state is an accept state
- */
- public void setAccept(boolean accept) {
- this.accept = accept;
- }
-
- /**
- * Returns acceptance status.
- *
- * @return true is this is an accept state
- */
- public boolean isAccept() {
- return accept;
- }
-
- /**
- * Performs lookup in transitions, assuming determinism.
- *
- * @param c codepoint to look up
- * @return destination state, null if no matching outgoing transition
- * @see #step(int, Collection)
- */
- public State step(int c) {
- assert c >= 0;
- for (int i=0;i dest) {
- for (int i=0;i max) max = t.max;
- } else {
- if (p != null) {
- transitionsArray[upto++] = new Transition(min, max, p);
- }
- min = t.min;
- max = t.max;
- }
- } else {
- if (p != null) {
- transitionsArray[upto++] = new Transition(min, max, p);
- }
- p = t.to;
- min = t.min;
- max = t.max;
- }
- }
-
- if (p != null) {
- transitionsArray[upto++] = new Transition(min, max, p);
- }
- numTransitions = upto;
- }
-
- /**
- * Returns sorted list of outgoing transitions.
- *
- * @param to_first if true, order by (to, min, reverse max); otherwise (min,
- * reverse max, to)
- * @return transition list
- */
-
- /** Sorts transitions array in-place. */
- public void sortTransitions(Comparator comparator) {
- // mergesort seems to perform better on already sorted arrays:
- if (numTransitions > 1) ArrayUtil.timSort(transitionsArray, 0, numTransitions, comparator);
- }
-
- /**
- * Return this state's number.
- *
- * Expert: Will be useless unless {@link Automaton#getNumberedStates}
- * has been called first to number the states.
- * @return the number
- */
- public int getNumber() {
- return number;
- }
-
- /**
- * Returns string describing this state. Normally invoked via
- * {@link Automaton#toString()}.
- */
- @Override
- public String toString() {
- StringBuilder b = new StringBuilder();
- b.append("state ").append(number);
- if (accept) b.append(" [accept]");
- else b.append(" [reject]");
- b.append(":\n");
- for (Transition t : getTransitions())
- b.append(" ").append(t.toString()).append("\n");
- return b.toString();
- }
-
- /**
- * Compares this object with the specified object for order. States are
- * ordered by the time of construction.
- */
- @Override
- public int compareTo(State s) {
- return s.id - id;
- }
-}
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java
index 4124b9e8b930..4ce81ab35a9f 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java
@@ -35,11 +35,11 @@
* @lucene.experimental
*/
public class StatePair {
- State s;
- State s1;
- State s2;
+ int s;
+ int s1;
+ int s2;
- StatePair(State s, State s1, State s2) {
+ StatePair(int s, int s1, int s2) {
this.s = s;
this.s1 = s1;
this.s2 = s2;
@@ -51,27 +51,10 @@ public class StatePair {
* @param s1 first state
* @param s2 second state
*/
- public StatePair(State s1, State s2) {
+ public StatePair(int s1, int s2) {
this.s1 = s1;
this.s2 = s2;
- }
-
- /**
- * Returns first component of this pair.
- *
- * @return first state
- */
- public State getFirstState() {
- return s1;
- }
-
- /**
- * Returns second component of this pair.
- *
- * @return second state
- */
- public State getSecondState() {
- return s2;
+ this.s = -1;
}
/**
@@ -96,6 +79,11 @@ public boolean equals(Object obj) {
*/
@Override
public int hashCode() {
- return s1.hashCode() + s2.hashCode();
+ return s1 ^ s2;
+ }
+
+ @Override
+ public String toString() {
+ return "StatePair(s1=" + s1 + " s2=" + s2 + ")";
}
}
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java
index d22c6dbace74..fc5b6589a9bc 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java
@@ -1,214 +1,51 @@
+package org.apache.lucene.util.automaton;
+
/*
- * dk.brics.automaton
- *
- * Copyright (c) 2001-2009 Anders Moeller
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*/
-package org.apache.lucene.util.automaton;
+/** Holds one transition from an {@link Automaton}. This is typically
+ * used temporarily when iterating through transitions by invoking
+ * {@link Automaton#initTransition} and {@link Automaton#getNextTransition}. */
-import java.util.Comparator;
+public class Transition {
-/**
- * Automaton transition.
- *
- * A transition, which belongs to a source state, consists of a Unicode
- * codepoint interval and a destination state.
- *
- * @lucene.experimental
- */
-public class Transition implements Cloneable {
-
- /*
- * CLASS INVARIANT: min<=max
- */
-
- final int min;
- final int max;
- final State to;
-
- /**
- * Constructs a new singleton interval transition.
- *
- * @param c transition codepoint
- * @param to destination state
- */
- public Transition(int c, State to) {
- assert c >= 0;
- min = max = c;
- this.to = to;
- }
-
- /**
- * Constructs a new transition. Both end points are included in the interval.
- *
- * @param min transition interval minimum
- * @param max transition interval maximum
- * @param to destination state
- */
- public Transition(int min, int max, State to) {
- assert min >= 0;
- assert max >= 0;
- if (max < min) {
- int t = max;
- max = min;
- min = t;
- }
- this.min = min;
- this.max = max;
- this.to = to;
- }
-
- /** Returns minimum of this transition interval. */
- public int getMin() {
- return min;
- }
-
- /** Returns maximum of this transition interval. */
- public int getMax() {
- return max;
- }
-
- /** Returns destination of this transition. */
- public State getDest() {
- return to;
- }
-
- /**
- * Checks for equality.
- *
- * @param obj object to compare with
- * @return true if obj is a transition with same character interval
- * and destination state as this transition.
- */
- @Override
- public boolean equals(Object obj) {
- if (obj instanceof Transition) {
- Transition t = (Transition) obj;
- return t.min == min && t.max == max && t.to == to;
- } else return false;
- }
-
- /**
- * Returns hash code. The hash code is based on the character interval (not
- * the destination state).
- *
- * @return hash code
- */
- @Override
- public int hashCode() {
- return min * 2 + max * 3;
- }
-
- /**
- * Clones this transition.
- *
- * @return clone with same character interval and destination state
- */
- @Override
- public Transition clone() {
- try {
- return (Transition) super.clone();
- } catch (CloneNotSupportedException e) {
- throw new RuntimeException(e);
- }
- }
-
- static void appendCharString(int c, StringBuilder b) {
- if (c >= 0x21 && c <= 0x7e && c != '\\' && c != '"') b.appendCodePoint(c);
- else {
- b.append("\\\\U");
- String s = Integer.toHexString(c);
- if (c < 0x10) b.append("0000000").append(s);
- else if (c < 0x100) b.append("000000").append(s);
- else if (c < 0x1000) b.append("00000").append(s);
- else if (c < 0x10000) b.append("0000").append(s);
- else if (c < 0x100000) b.append("000").append(s);
- else if (c < 0x1000000) b.append("00").append(s);
- else if (c < 0x10000000) b.append("0").append(s);
- else b.append(s);
- }
- }
-
- /**
- * Returns a string describing this state. Normally invoked via
- * {@link Automaton#toString()}.
- */
- @Override
- public String toString() {
- StringBuilder b = new StringBuilder();
- appendCharString(min, b);
- if (min != max) {
- b.append("-");
- appendCharString(max, b);
- }
- b.append(" -> ").append(to.number);
- return b.toString();
- }
-
- void appendDot(StringBuilder b) {
- b.append(" -> ").append(to.number).append(" [label=\"");
- appendCharString(min, b);
- if (min != max) {
- b.append("-");
- appendCharString(max, b);
- }
- b.append("\"]\n");
+ /** Sole constructor. */
+ public Transition() {
}
- private static final class CompareByDestThenMinMaxSingle implements Comparator {
- @Override
- public int compare(Transition t1, Transition t2) {
- if (t1.to != t2.to) {
- if (t1.to.number < t2.to.number) return -1;
- else if (t1.to.number > t2.to.number) return 1;
- }
- if (t1.min < t2.min) return -1;
- if (t1.min > t2.min) return 1;
- if (t1.max > t2.max) return -1;
- if (t1.max < t2.max) return 1;
- return 0;
- }
- }
+ /** Source state. */
+ public int source;
- public static final Comparator CompareByDestThenMinMax = new CompareByDestThenMinMaxSingle();
+ /** Destination state. */
+ public int dest;
- private static final class CompareByMinMaxThenDestSingle implements Comparator {
- @Override
- public int compare(Transition t1, Transition t2) {
- if (t1.min < t2.min) return -1;
- if (t1.min > t2.min) return 1;
- if (t1.max > t2.max) return -1;
- if (t1.max < t2.max) return 1;
- if (t1.to != t2.to) {
- if (t1.to.number < t2.to.number) return -1;
- if (t1.to.number > t2.to.number) return 1;
- }
- return 0;
- }
- }
+ /** Minimum accepted label (inclusive). */
+ public int min;
+
+ /** Maximum accepted label (inclusive). */
+ public int max;
- public static final Comparator CompareByMinMaxThenDest = new CompareByMinMaxThenDestSingle();
+ /** Remembers where we are in the iteration; init to -1 to provoke
+ * exception if getNextTransition is called without first calling initTransition. */
+ int transitionUpto = -1;
+
+ @Override
+ public String toString() {
+ return source + " --> " + dest + " " + (char) min + "-" + (char) max;
+ }
}
+
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java b/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java
index 17be0ec152b2..059ee09b4bb3 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java
@@ -17,11 +17,9 @@
* limitations under the License.
*/
-import org.apache.lucene.util.RamUsageEstimator;
-import org.apache.lucene.util.ArrayUtil;
-
-import java.util.List;
+import java.util.Arrays;
import java.util.ArrayList;
+import java.util.List;
// TODO
// - do we really need the .bits...? if not we can make util in UnicodeUtil to convert 1 char into a BytesRef
@@ -122,6 +120,10 @@ public String toString() {
}
}
+ /** Sole constructor. */
+ public UTF32ToUTF8() {
+ }
+
private final UTF8Sequence startUTF8 = new UTF8Sequence();
private final UTF8Sequence endUTF8 = new UTF8Sequence();
@@ -129,37 +131,37 @@ public String toString() {
private final UTF8Sequence tmpUTF8b = new UTF8Sequence();
// Builds necessary utf8 edges between start & end
- void convertOneEdge(State start, State end, int startCodePoint, int endCodePoint) {
+ void convertOneEdge(int start, int end, int startCodePoint, int endCodePoint) {
startUTF8.set(startCodePoint);
endUTF8.set(endCodePoint);
- //System.out.println("start = " + startUTF8);
- //System.out.println(" end = " + endUTF8);
build(start, end, startUTF8, endUTF8, 0);
}
- private void build(State start, State end, UTF8Sequence startUTF8, UTF8Sequence endUTF8, int upto) {
+ private void build(int start, int end, UTF8Sequence startUTF8, UTF8Sequence endUTF8, int upto) {
// Break into start, middle, end:
if (startUTF8.byteAt(upto) == endUTF8.byteAt(upto)) {
// Degen case: lead with the same byte:
if (upto == startUTF8.len-1 && upto == endUTF8.len-1) {
// Super degen: just single edge, one UTF8 byte:
- start.addTransition(new Transition(startUTF8.byteAt(upto), endUTF8.byteAt(upto), end));
+ utf8.addTransition(start, end, startUTF8.byteAt(upto), endUTF8.byteAt(upto));
return;
} else {
assert startUTF8.len > upto+1;
assert endUTF8.len > upto+1;
- State n = newUTF8State();
+ int n = utf8.createState();
// Single value leading edge
- start.addTransition(new Transition(startUTF8.byteAt(upto), n)); // type=single
+ utf8.addTransition(start, n, startUTF8.byteAt(upto));
+ //start.addTransition(new Transition(startUTF8.byteAt(upto), n)); // type=single
// Recurse for the rest
build(n, end, startUTF8, endUTF8, 1+upto);
}
} else if (startUTF8.len == endUTF8.len) {
if (upto == startUTF8.len-1) {
- start.addTransition(new Transition(startUTF8.byteAt(upto), endUTF8.byteAt(upto), end)); // type=startend
+ //start.addTransition(new Transition(startUTF8.byteAt(upto), endUTF8.byteAt(upto), end)); // type=startend
+ utf8.addTransition(start, end, startUTF8.byteAt(upto), endUTF8.byteAt(upto));
} else {
start(start, end, startUTF8, upto, false);
if (endUTF8.byteAt(upto) - startUTF8.byteAt(upto) > 1) {
@@ -193,62 +195,69 @@ private void build(State start, State end, UTF8Sequence startUTF8, UTF8Sequence
}
}
- private void start(State start, State end, UTF8Sequence utf8, int upto, boolean doAll) {
- if (upto == utf8.len-1) {
+ private void start(int start, int end, UTF8Sequence startUTF8, int upto, boolean doAll) {
+ if (upto == startUTF8.len-1) {
// Done recursing
- start.addTransition(new Transition(utf8.byteAt(upto), utf8.byteAt(upto) | MASKS[utf8.numBits(upto)-1], end)); // type=start
+ utf8.addTransition(start, end, startUTF8.byteAt(upto), startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1]); // type=start
+ //start.addTransition(new Transition(startUTF8.byteAt(upto), startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1], end)); // type=start
} else {
- State n = newUTF8State();
- start.addTransition(new Transition(utf8.byteAt(upto), n)); // type=start
- start(n, end, utf8, 1+upto, true);
- int endCode = utf8.byteAt(upto) | MASKS[utf8.numBits(upto)-1];
- if (doAll && utf8.byteAt(upto) != endCode) {
- all(start, end, utf8.byteAt(upto)+1, endCode, utf8.len-upto-1);
+ int n = utf8.createState();
+ utf8.addTransition(start, n, startUTF8.byteAt(upto));
+ //start.addTransition(new Transition(startUTF8.byteAt(upto), n)); // type=start
+ start(n, end, startUTF8, 1+upto, true);
+ int endCode = startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1];
+ if (doAll && startUTF8.byteAt(upto) != endCode) {
+ all(start, end, startUTF8.byteAt(upto)+1, endCode, startUTF8.len-upto-1);
}
}
}
- private void end(State start, State end, UTF8Sequence utf8, int upto, boolean doAll) {
- if (upto == utf8.len-1) {
+ private void end(int start, int end, UTF8Sequence endUTF8, int upto, boolean doAll) {
+ if (upto == endUTF8.len-1) {
// Done recursing
- start.addTransition(new Transition(utf8.byteAt(upto) & (~MASKS[utf8.numBits(upto)-1]), utf8.byteAt(upto), end)); // type=end
+ //start.addTransition(new Transition(endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]), endUTF8.byteAt(upto), end)); // type=end
+ utf8.addTransition(start, end, endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]), endUTF8.byteAt(upto));
} else {
final int startCode;
- if (utf8.numBits(upto) == 5) {
- // special case -- avoid created unused edges (utf8
+ if (endUTF8.numBits(upto) == 5) {
+ // special case -- avoid creating unused edges (UTF-8
// doesn't accept certain byte sequences) -- there
// are other cases we could optimize too:
startCode = 194;
} else {
- startCode = utf8.byteAt(upto) & (~MASKS[utf8.numBits(upto)-1]);
+ startCode = endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]);
}
- if (doAll && utf8.byteAt(upto) != startCode) {
- all(start, end, startCode, utf8.byteAt(upto)-1, utf8.len-upto-1);
+ if (doAll && endUTF8.byteAt(upto) != startCode) {
+ all(start, end, startCode, endUTF8.byteAt(upto)-1, endUTF8.len-upto-1);
}
- State n = newUTF8State();
- start.addTransition(new Transition(utf8.byteAt(upto), n)); // type=end
- end(n, end, utf8, 1+upto, true);
+ int n = utf8.createState();
+ //start.addTransition(new Transition(endUTF8.byteAt(upto), n)); // type=end
+ utf8.addTransition(start, n, endUTF8.byteAt(upto));
+ end(n, end, endUTF8, 1+upto, true);
}
}
- private void all(State start, State end, int startCode, int endCode, int left) {
+ private void all(int start, int end, int startCode, int endCode, int left) {
if (left == 0) {
- start.addTransition(new Transition(startCode, endCode, end)); // type=all
+ //start.addTransition(new Transition(startCode, endCode, end)); // type=all
+ utf8.addTransition(start, end, startCode, endCode);
} else {
- State lastN = newUTF8State();
- start.addTransition(new Transition(startCode, endCode, lastN)); // type=all
+ int lastN = utf8.createState();
+ //start.addTransition(new Transition(startCode, endCode, lastN)); // type=all
+ utf8.addTransition(start, lastN, startCode, endCode);
while (left > 1) {
- State n = newUTF8State();
- lastN.addTransition(new Transition(128, 191, n)); // type=all*
+ int n = utf8.createState();
+ //lastN.addTransition(new Transition(128, 191, n)); // type=all*
+ utf8.addTransition(lastN, n, 128, 191); // type=all*
left--;
lastN = n;
}
- lastN.addTransition(new Transition(128, 191, end)); // type = all*
+ //lastN.addTransition(new Transition(128, 191, end)); // type = all*
+ utf8.addTransition(lastN, end, 128, 191); // type = all*
}
}
- private State[] utf8States;
- private int utf8StateCount;
+ Automaton.Builder utf8;
/** Converts an incoming utf32 automaton to an equivalent
* utf8 one. The incoming automaton need not be
@@ -256,61 +265,49 @@ private void all(State start, State end, int startCode, int endCode, int left) {
* not in general be deterministic, so you must
* determinize it if that's needed. */
public Automaton convert(Automaton utf32) {
- if (utf32.isSingleton()) {
- utf32 = utf32.cloneExpanded();
+ if (utf32.getNumStates() == 0) {
+ return utf32;
}
- State[] map = new State[utf32.getNumberedStates().length];
- List pending = new ArrayList<>();
- State utf32State = utf32.getInitialState();
- pending.add(utf32State);
- Automaton utf8 = new Automaton();
- utf8.setDeterministic(false);
-
- State utf8State = utf8.getInitialState();
+ int[] map = new int[utf32.getNumStates()];
+ Arrays.fill(map, -1);
- utf8States = new State[5];
- utf8StateCount = 0;
- utf8State.number = utf8StateCount;
- utf8States[utf8StateCount] = utf8State;
- utf8StateCount++;
+ List pending = new ArrayList<>();
+ int utf32State = 0;
+ pending.add(utf32State);
+ utf8 = new Automaton.Builder();
+
+ int utf8State = utf8.createState();
- utf8State.setAccept(utf32State.isAccept());
+ utf8.setAccept(utf8State, utf32.isAccept(utf32State));
- map[utf32State.number] = utf8State;
+ map[utf32State] = utf8State;
+
+ Transition scratch = new Transition();
- while(pending.size() != 0) {
+ while (pending.size() != 0) {
utf32State = pending.remove(pending.size()-1);
- utf8State = map[utf32State.number];
- for(int i=0;i as = new ArrayList<>();
for(String s : strings) {
- as.add(BasicAutomata.makeString(s));
+ as.add(s2a(s));
as.add(SEP_A);
}
as.remove(as.size()-1);
- return BasicOperations.concatenate(as);
+ return Operations.concatenate(as);
}
private Automaton join(Automaton ... as) {
- return BasicOperations.concatenate(Arrays.asList(as));
+ return Operations.concatenate(Arrays.asList(as));
}
private Automaton s2a(String s) {
- return BasicAutomata.makeString(s);
+ return Automata.makeString(s);
}
public void testTwoTokens() throws Exception {
@@ -482,7 +485,8 @@ public void testTwoTokens() throws Exception {
final Automaton expected = join("abc", "def");
//toDot(actual);
- assertTrue(BasicOperations.sameLanguage(expected, actual));
+ assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
+ Operations.determinize(Operations.removeDeadStates(actual))));
}
public void testHole() throws Exception {
@@ -497,7 +501,8 @@ public void testHole() throws Exception {
final Automaton expected = join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"));
//toDot(actual);
- assertTrue(BasicOperations.sameLanguage(expected, actual));
+ assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
+ Operations.determinize(Operations.removeDeadStates(actual))));
}
public void testOverlappedTokensSausage() throws Exception {
@@ -509,10 +514,11 @@ public void testOverlappedTokensSausage() throws Exception {
token("xyz", 0, 1)
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
- final Automaton a1 = BasicAutomata.makeString("abc");
- final Automaton a2 = BasicAutomata.makeString("xyz");
- final Automaton expected = BasicOperations.union(a1, a2);
- assertTrue(BasicOperations.sameLanguage(expected, actual));
+ final Automaton a1 = s2a("abc");
+ final Automaton a2 = s2a("xyz");
+ final Automaton expected = Operations.union(a1, a2);
+ assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
+ Operations.determinize(Operations.removeDeadStates(actual))));
}
public void testOverlappedTokensLattice() throws Exception {
@@ -524,12 +530,13 @@ public void testOverlappedTokensLattice() throws Exception {
token("def", 1, 1),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
- final Automaton a1 = BasicAutomata.makeString("xyz");
+ final Automaton a1 = s2a("xyz");
final Automaton a2 = join("abc", "def");
- final Automaton expected = BasicOperations.union(a1, a2);
+ final Automaton expected = Operations.union(a1, a2);
//toDot(actual);
- assertTrue(BasicOperations.sameLanguage(expected, actual));
+ assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
+ Operations.determinize(Operations.removeDeadStates(actual))));
}
public void testSynOverHole() throws Exception {
@@ -541,13 +548,14 @@ public void testSynOverHole() throws Exception {
token("b", 2, 1),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
- final Automaton a1 = BasicOperations.union(
+ final Automaton a1 = Operations.union(
join(s2a("a"), SEP_A, HOLE_A),
- BasicAutomata.makeString("X"));
- final Automaton expected = BasicOperations.concatenate(a1,
+ s2a("X"));
+ final Automaton expected = Operations.concatenate(a1,
join(SEP_A, s2a("b")));
//toDot(actual);
- assertTrue(BasicOperations.sameLanguage(expected, actual));
+ assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
+ Operations.determinize(Operations.removeDeadStates(actual))));
}
public void testSynOverHole2() throws Exception {
@@ -559,10 +567,11 @@ public void testSynOverHole2() throws Exception {
token("def", 2, 1),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
- final Automaton expected = BasicOperations.union(
+ final Automaton expected = Operations.union(
join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")),
- BasicAutomata.makeString("abc"));
- assertTrue(BasicOperations.sameLanguage(expected, actual));
+ s2a("abc"));
+ assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
+ Operations.determinize(Operations.removeDeadStates(actual))));
}
public void testOverlappedTokensLattice2() throws Exception {
@@ -575,11 +584,12 @@ public void testOverlappedTokensLattice2() throws Exception {
token("ghi", 1, 1),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
- final Automaton a1 = BasicAutomata.makeString("xyz");
+ final Automaton a1 = s2a("xyz");
final Automaton a2 = join("abc", "def", "ghi");
- final Automaton expected = BasicOperations.union(a1, a2);
+ final Automaton expected = Operations.union(a1, a2);
//toDot(actual);
- assertTrue(BasicOperations.sameLanguage(expected, actual));
+ assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
+ Operations.determinize(Operations.removeDeadStates(actual))));
}
public void testToDot() throws Exception {
@@ -597,7 +607,8 @@ public void testStartsWithHole() throws Exception {
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton expected = join(HOLE_A, SEP_A, s2a("abc"));
//toDot(actual);
- assertTrue(BasicOperations.sameLanguage(expected, actual));
+ assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
+ Operations.determinize(Operations.removeDeadStates(actual))));
}
// TODO: testEndsWithHole... but we need posInc to set in TS.end()
@@ -609,8 +620,9 @@ public void testSynHangingOverEnd() throws Exception {
token("X", 0, 10),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
- final Automaton expected = BasicOperations.union(BasicAutomata.makeString("a"),
- BasicAutomata.makeString("X"));
- assertTrue(BasicOperations.sameLanguage(expected, actual));
+ final Automaton expected = Operations.union(s2a("a"),
+ s2a("X"));
+ assertTrue(Operations.sameLanguage(Operations.determinize(Operations.removeDeadStates(expected)),
+ Operations.determinize(Operations.removeDeadStates(actual))));
}
}
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java b/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java
index 18d5f8083c64..9d68ea864e36 100644
--- a/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java
@@ -34,10 +34,9 @@
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TestUtil;
-import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.AutomatonTestUtil;
-import org.apache.lucene.util.automaton.BasicAutomata;
-import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
@@ -165,9 +164,9 @@ public void testStop() throws Exception {
public void testKeep() throws Exception {
CharacterRunAutomaton keepWords =
new CharacterRunAutomaton(
- BasicOperations.complement(
- Automaton.union(
- Arrays.asList(BasicAutomata.makeString("foo"), BasicAutomata.makeString("bar")))));
+ Operations.complement(
+ Operations.union(
+ Arrays.asList(Automata.makeString("foo"), Automata.makeString("bar")))));
Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords);
assertAnalyzesTo(a, "quick foo brown bar bar fox foo",
new String[] { "foo", "bar", "bar", "foo" },
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
index 03881d91b9d0..533cc7222d70 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
@@ -84,9 +84,9 @@
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.ThreadInterruptedException;
import org.apache.lucene.util.Version;
-import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.packed.PackedInts;
import org.junit.Test;
@@ -2006,7 +2006,7 @@ protected TokenStreamComponents createComponents(String fieldName) {
public void testStopwordsPosIncHole2() throws Exception {
// use two stopfilters for testing here
Directory dir = newDirectory();
- final Automaton secondSet = BasicAutomata.makeString("foobar");
+ final Automaton secondSet = Automata.makeString("foobar");
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java
index ecf4e9b05ca3..9acc4fd3b5ff 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java
@@ -33,9 +33,9 @@
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
-import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CompiledAutomaton;
+import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.RegExp;
@SuppressCodecs({ "SimpleText", "Memory", "Direct" })
@@ -244,7 +244,7 @@ public void testIntersectRandom() throws IOException {
if (VERBOSE) {
System.out.println("\nTEST: empty automaton");
}
- a = BasicAutomata.makeEmpty();
+ a = Automata.makeEmpty();
} else {
if (VERBOSE) {
System.out.println("\nTEST: keepPct=" + keepPct);
@@ -259,16 +259,9 @@ public void testIntersectRandom() throws IOException {
acceptTerms.add(s2);
sortedAcceptTerms.add(new BytesRef(s2));
}
- a = BasicAutomata.makeStringUnion(sortedAcceptTerms);
+ a = Automata.makeStringUnion(sortedAcceptTerms);
}
- if (random().nextBoolean()) {
- if (VERBOSE) {
- System.out.println("TEST: reduce the automaton");
- }
- a.reduce();
- }
-
final CompiledAutomaton c = new CompiledAutomaton(a, true, false);
final BytesRef[] acceptTermsArray = new BytesRef[acceptTerms.size()];
@@ -745,7 +738,7 @@ public void testIntersectBasic() throws Exception {
w.shutdown();
AtomicReader sub = getOnlySegmentReader(r);
Terms terms = sub.fields().terms("field");
- Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();
+ Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();
CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);
TermsEnum te = terms.intersect(ca, null);
assertEquals("aaa", te.next().utf8ToString());
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java
index 55f8f0d7ce10..f261f16e3e7d 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java
@@ -31,6 +31,7 @@
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.CheckHits;
import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
@@ -68,7 +69,7 @@ public void setUp() throws Exception {
writer.addDocument(doc);
}
- termsAutomaton = BasicAutomata.makeStringUnion(terms);
+ termsAutomaton = Automata.makeStringUnion(terms);
reader = writer.getReader();
searcher = newSearcher(reader);
@@ -84,23 +85,27 @@ public void tearDown() throws Exception {
/** tests a pre-intersected automaton against the original */
public void testFiniteVersusInfinite() throws Exception {
+
for (int i = 0; i < numIterations; i++) {
String reg = AutomatonTestUtil.randomRegexp(random());
- Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton();
+ Automaton automaton = Operations.determinize(new RegExp(reg, RegExp.NONE).toAutomaton());
final List matchedTerms = new ArrayList<>();
for(BytesRef t : terms) {
- if (BasicOperations.run(automaton, t.utf8ToString())) {
+ if (Operations.run(automaton, t.utf8ToString())) {
matchedTerms.add(t);
}
}
- Automaton alternate = BasicAutomata.makeStringUnion(matchedTerms);
+ Automaton alternate = Automata.makeStringUnion(matchedTerms);
//System.out.println("match " + matchedTerms.size() + " " + alternate.getNumberOfStates() + " states, sigma=" + alternate.getStartPoints().length);
//AutomatonTestUtil.minimizeSimple(alternate);
//System.out.println("minmize done");
AutomatonQuery a1 = new AutomatonQuery(new Term("field", ""), automaton);
AutomatonQuery a2 = new AutomatonQuery(new Term("field", ""), alternate);
- CheckHits.checkEqual(a1, searcher.search(a1, 25).scoreDocs, searcher.search(a2, 25).scoreDocs);
+
+ ScoreDoc[] origHits = searcher.search(a1, 25).scoreDocs;
+ ScoreDoc[] newHits = searcher.search(a2, 25).scoreDocs;
+ CheckHits.checkEqual(a1, origHits, newHits);
}
}
@@ -108,13 +113,13 @@ public void testFiniteVersusInfinite() throws Exception {
public void testSeeking() throws Exception {
for (int i = 0; i < numIterations; i++) {
String reg = AutomatonTestUtil.randomRegexp(random());
- Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton();
+ Automaton automaton = Operations.determinize(new RegExp(reg, RegExp.NONE).toAutomaton());
TermsEnum te = MultiFields.getTerms(reader, "field").iterator(null);
ArrayList unsortedTerms = new ArrayList<>(terms);
Collections.shuffle(unsortedTerms, random());
for (BytesRef term : unsortedTerms) {
- if (BasicOperations.run(automaton, term.utf8ToString())) {
+ if (Operations.run(automaton, term.utf8ToString())) {
// term is accepted
if (random().nextBoolean()) {
// seek exact
@@ -153,16 +158,16 @@ public void testIntersect() throws Exception {
for (int i = 0; i < numIterations; i++) {
String reg = AutomatonTestUtil.randomRegexp(random());
Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton();
- CompiledAutomaton ca = new CompiledAutomaton(automaton, SpecialOperations.isFinite(automaton), false);
+ CompiledAutomaton ca = new CompiledAutomaton(automaton, Operations.isFinite(automaton), false);
TermsEnum te = MultiFields.getTerms(reader, "field").intersect(ca, null);
- Automaton expected = BasicOperations.intersection(termsAutomaton, automaton);
+ Automaton expected = Operations.determinize(Operations.intersection(termsAutomaton, automaton));
TreeSet found = new TreeSet<>();
while (te.next() != null) {
found.add(BytesRef.deepCopyOf(te.term()));
}
- Automaton actual = BasicAutomata.makeStringUnion(found);
- assertTrue(BasicOperations.sameLanguage(expected, actual));
+ Automaton actual = Operations.determinize(Automata.makeStringUnion(found));
+ assertTrue(Operations.sameLanguage(expected, actual));
}
}
}
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java
index 1887ed02767c..c1cc032797d9 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java
@@ -33,10 +33,10 @@
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.Rethrow;
import org.apache.lucene.util.TestUtil;
-import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.AutomatonTestUtil;
-import org.apache.lucene.util.automaton.BasicAutomata;
-import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.automaton.Automaton;
public class TestAutomatonQuery extends LuceneTestCase {
private Directory directory;
@@ -106,24 +106,24 @@ private void assertAutomatonHits(int expected, Automaton automaton)
/**
* Test some very simple automata.
*/
- public void testBasicAutomata() throws IOException {
- assertAutomatonHits(0, BasicAutomata.makeEmpty());
- assertAutomatonHits(0, BasicAutomata.makeEmptyString());
- assertAutomatonHits(2, BasicAutomata.makeAnyChar());
- assertAutomatonHits(3, BasicAutomata.makeAnyString());
- assertAutomatonHits(2, BasicAutomata.makeString("doc"));
- assertAutomatonHits(1, BasicAutomata.makeChar('a'));
- assertAutomatonHits(2, BasicAutomata.makeCharRange('a', 'b'));
- assertAutomatonHits(2, BasicAutomata.makeInterval(1233, 2346, 0));
- assertAutomatonHits(1, BasicAutomata.makeInterval(0, 2000, 0));
- assertAutomatonHits(2, BasicOperations.union(BasicAutomata.makeChar('a'),
- BasicAutomata.makeChar('b')));
- assertAutomatonHits(0, BasicOperations.intersection(BasicAutomata
- .makeChar('a'), BasicAutomata.makeChar('b')));
- assertAutomatonHits(1, BasicOperations.minus(BasicAutomata.makeCharRange('a', 'b'),
- BasicAutomata.makeChar('a')));
+ public void testAutomata() throws IOException {
+ assertAutomatonHits(0, Automata.makeEmpty());
+ assertAutomatonHits(0, Automata.makeEmptyString());
+ assertAutomatonHits(2, Automata.makeAnyChar());
+ assertAutomatonHits(3, Automata.makeAnyString());
+ assertAutomatonHits(2, Automata.makeString("doc"));
+ assertAutomatonHits(1, Automata.makeChar('a'));
+ assertAutomatonHits(2, Automata.makeCharRange('a', 'b'));
+ assertAutomatonHits(2, Automata.makeInterval(1233, 2346, 0));
+ assertAutomatonHits(1, Automata.makeInterval(0, 2000, 0));
+ assertAutomatonHits(2, Operations.union(Automata.makeChar('a'),
+ Automata.makeChar('b')));
+ assertAutomatonHits(0, Operations.intersection(Automata
+ .makeChar('a'), Automata.makeChar('b')));
+ assertAutomatonHits(1, Operations.minus(Automata.makeCharRange('a', 'b'),
+ Automata.makeChar('a')));
}
-
+
/**
* Test that a nondeterministic automaton works correctly. (It should will be
* determinized)
@@ -131,26 +131,27 @@ public void testBasicAutomata() throws IOException {
public void testNFA() throws IOException {
// accept this or three, the union is an NFA (two transitions for 't' from
// initial state)
- Automaton nfa = BasicOperations.union(BasicAutomata.makeString("this"),
- BasicAutomata.makeString("three"));
+ Automaton nfa = Operations.union(Automata.makeString("this"),
+ Automata.makeString("three"));
assertAutomatonHits(2, nfa);
}
public void testEquals() {
- AutomatonQuery a1 = new AutomatonQuery(newTerm("foobar"), BasicAutomata
+ AutomatonQuery a1 = new AutomatonQuery(newTerm("foobar"), Automata
.makeString("foobar"));
// reference to a1
AutomatonQuery a2 = a1;
// same as a1 (accepts the same language, same term)
- AutomatonQuery a3 = new AutomatonQuery(newTerm("foobar"), BasicOperations
- .concatenate(BasicAutomata.makeString("foo"), BasicAutomata
- .makeString("bar")));
+ AutomatonQuery a3 = new AutomatonQuery(newTerm("foobar"),
+ Operations.concatenate(
+ Automata.makeString("foo"),
+ Automata.makeString("bar")));
// different than a1 (same term, but different language)
- AutomatonQuery a4 = new AutomatonQuery(newTerm("foobar"), BasicAutomata
- .makeString("different"));
+ AutomatonQuery a4 = new AutomatonQuery(newTerm("foobar"),
+ Automata.makeString("different"));
// different than a1 (different term, same language)
- AutomatonQuery a5 = new AutomatonQuery(newTerm("blah"), BasicAutomata
- .makeString("foobar"));
+ AutomatonQuery a5 = new AutomatonQuery(newTerm("blah"),
+ Automata.makeString("foobar"));
assertEquals(a1.hashCode(), a2.hashCode());
assertEquals(a1, a2);
@@ -176,8 +177,7 @@ public void testEquals() {
* MultiTermQuery semantics.
*/
public void testRewriteSingleTerm() throws IOException {
- AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), BasicAutomata
- .makeString("piece"));
+ AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), Automata.makeString("piece"));
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), FN);
assertTrue(aq.getTermsEnum(terms) instanceof SingleTermsEnum);
assertEquals(1, automatonQueryNrHits(aq));
@@ -188,10 +188,8 @@ public void testRewriteSingleTerm() throws IOException {
* MultiTermQuery semantics.
*/
public void testRewritePrefix() throws IOException {
- Automaton pfx = BasicAutomata.makeString("do");
- pfx.expandSingleton(); // expand singleton representation for testing
- Automaton prefixAutomaton = BasicOperations.concatenate(pfx, BasicAutomata
- .makeAnyString());
+ Automaton pfx = Automata.makeString("do");
+ Automaton prefixAutomaton = Operations.concatenate(pfx, Automata.makeAnyString());
AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), prefixAutomaton);
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), FN);
assertTrue(aq.getTermsEnum(terms) instanceof PrefixTermsEnum);
@@ -202,8 +200,7 @@ public void testRewritePrefix() throws IOException {
* Test handling of the empty language
*/
public void testEmptyOptimization() throws IOException {
- AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), BasicAutomata
- .makeEmpty());
+ AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), Automata.makeEmpty());
// not yet available: assertTrue(aq.getEnum(searcher.getIndexReader())
// instanceof EmptyTermEnum);
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), FN);
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestDocTermOrdsRewriteMethod.java b/lucene/core/src/test/org/apache/lucene/search/TestDocTermOrdsRewriteMethod.java
index d80d703668bb..9dd845fdf7fb 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestDocTermOrdsRewriteMethod.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestDocTermOrdsRewriteMethod.java
@@ -78,7 +78,7 @@ public void setUp() throws Exception {
Collections.sort(terms);
System.out.println("UTF16 order:");
for(String s : terms) {
- System.out.println(" " + UnicodeUtil.toHexString(s));
+ System.out.println(" " + UnicodeUtil.toHexString(s) + " " + s);
}
}
@@ -115,7 +115,7 @@ public void testRegexps() throws Exception {
/** check that the # of hits is the same as if the query
* is run against the inverted index
*/
- protected void assertSame(String regexp) throws IOException {
+ protected void assertSame(String regexp) throws IOException {
RegexpQuery docValues = new RegexpQuery(new Term(fieldName, regexp), RegExp.NONE);
docValues.setRewriteMethod(new DocTermOrdsRewriteMethod());
RegexpQuery inverted = new RegexpQuery(new Term(fieldName, regexp), RegExp.NONE);
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
index b276b894c6ab..327a003ef35c 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
@@ -38,6 +38,21 @@
*/
public class TestFuzzyQuery extends LuceneTestCase {
+ public void testBasicPrefix() throws Exception {
+ Directory directory = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
+ addDoc("abc", writer);
+ IndexReader reader = writer.getReader();
+ IndexSearcher searcher = newSearcher(reader);
+ writer.shutdown();
+
+ FuzzyQuery query = new FuzzyQuery(new Term("field", "abc"), FuzzyQuery.defaultMaxEdits, 1);
+ ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(1, hits.length);
+ reader.close();
+ directory.close();
+ }
+
public void testFuzziness() throws Exception {
Directory directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
index 05ea7a1249db..6ed31407824a 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
@@ -27,10 +27,10 @@
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.AutomatonProvider;
-import org.apache.lucene.util.automaton.BasicAutomata;
-import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.RegExp;
/**
@@ -97,10 +97,10 @@ public void testRegexComplement() throws IOException {
public void testCustomProvider() throws IOException {
AutomatonProvider myProvider = new AutomatonProvider() {
// automaton that matches quick or brown
- private Automaton quickBrownAutomaton = BasicOperations.union(Arrays
- .asList(BasicAutomata.makeString("quick"),
- BasicAutomata.makeString("brown"),
- BasicAutomata.makeString("bob")));
+ private Automaton quickBrownAutomaton = Operations.union(Arrays
+ .asList(Automata.makeString("quick"),
+ Automata.makeString("brown"),
+ Automata.makeString("bob")));
@Override
public Automaton getAutomaton(String name) {
@@ -108,8 +108,7 @@ public Automaton getAutomaton(String name) {
else return null;
}
};
- RegexpQuery query = new RegexpQuery(newTerm(""), RegExp.ALL,
- myProvider);
+ RegexpQuery query = new RegexpQuery(newTerm(""), RegExp.ALL, myProvider);
assertEquals(1, searcher.search(query, 5).totalHits);
}
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestRegexpRandom2.java b/lucene/core/src/test/org/apache/lucene/search/TestRegexpRandom2.java
index f6f2be586f97..ead284b90223 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestRegexpRandom2.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestRegexpRandom2.java
@@ -40,9 +40,9 @@
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.UnicodeUtil;
-import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.AutomatonTestUtil;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.RegExp;
/**
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java b/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java
index 5d65451714d4..195c3fdc5344 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java
@@ -268,7 +268,7 @@ private void assertMatches(IndexSearcher searcher, Query q, int expectedMatches)
* Test that wild card queries are parsed to the correct type and are searched correctly.
* This test looks at both parsing and execution of wildcard queries.
* Although placed here, it also tests prefix queries, verifying that
- * prefix queries are not parsed into wild card queries, and viceversa.
+ * prefix queries are not parsed into wild card queries, and vice-versa.
*/
public void testParsingAndSearching() throws Exception {
String field = "content";
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java
new file mode 100644
index 000000000000..fe0842fb6f7f
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java
@@ -0,0 +1,1065 @@
+package org.apache.lucene.util.automaton;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.automaton.AutomatonTestUtil.RandomAcceptedStrings;
+import org.apache.lucene.util.fst.Util;
+
+public class TestAutomaton extends LuceneTestCase {
+
+ /** Smoke test: hand-builds a four-state automaton; succeeds if construction throws nothing. */
+ public void testBasic() throws Exception {
+   Automaton automaton = new Automaton();
+   int initial = automaton.createState();
+   int afterA = automaton.createState();
+   int afterB = automaton.createState();
+   int accept = automaton.createState();
+   automaton.setAccept(accept, true);
+
+   // Two routes into the accept state: a -> b -> c, and the single label d.
+   automaton.addTransition(initial, afterA, 'a', 'a');
+   automaton.addTransition(initial, accept, 'd', 'd');
+   automaton.addTransition(afterA, afterB, 'b', 'b');
+   automaton.addTransition(afterB, accept, 'c', 'c');
+   automaton.finishState();
+ }
+
+ /**
+  * Adds five single-label transitions between the same two states and checks that, after
+  * finishState(), they are reported as three ranges: a-b, m-m and x-y.
+  */
+ public void testReduceBasic() throws Exception {
+   Automaton automaton = new Automaton();
+   int from = automaton.createState();
+   int to = automaton.createState();
+   automaton.setAccept(to, true);
+   // Should collapse to a-b:
+   automaton.addTransition(from, to, 'a', 'a');
+   automaton.addTransition(from, to, 'b', 'b');
+   automaton.addTransition(from, to, 'm', 'm');
+   // Should collapse to x-y:
+   automaton.addTransition(from, to, 'x', 'x');
+   automaton.addTransition(from, to, 'y', 'y');
+
+   automaton.finishState();
+
+   // Expected {min, max} pairs, in the order the transitions are iterated.
+   int[][] expectedRanges = {{'a', 'b'}, {'m', 'm'}, {'x', 'y'}};
+   assertEquals(expectedRanges.length, automaton.getNumTransitions(from));
+   Transition cursor = new Transition();
+   automaton.initTransition(from, cursor);
+   for (int[] range : expectedRanges) {
+     automaton.getNextTransition(cursor);
+     assertEquals(range[0], cursor.min);
+     assertEquals(range[1], cursor.max);
+   }
+ }
+
+ /** "foo" concatenated with "bar" (dead states removed) must accept the same language as "foobar". */
+ public void testSameLanguage() throws Exception {
+   Automaton whole = Automata.makeString("foobar");
+   Automaton joined = Operations.concatenate(Automata.makeString("foo"), Automata.makeString("bar"));
+   Automaton pruned = Operations.removeDeadStates(joined);
+   assertTrue(Operations.sameLanguage(whole, pruned));
+ }
+
+ /** The common prefix of "foobar" followed by any string must be "foobar". */
+ public void testCommonPrefix() throws Exception {
+   Automaton literal = Automata.makeString("foobar");
+   Automaton anything = Automata.makeAnyString();
+   Automaton prefixed = Operations.concatenate(literal, anything);
+   assertEquals("foobar", Operations.getCommonPrefix(prefixed));
+ }
+
+ /** "m" followed by any string must accept "m" itself and longer strings starting with "m". */
+ public void testConcatenate1() throws Exception {
+   Automaton startsWithM = Operations.concatenate(
+       Automata.makeString("m"),
+       Automata.makeAnyString());
+   for (String accepted : new String[] {"m", "me", "me too"}) {
+     assertTrue(Operations.run(startsWithM, accepted));
+   }
+ }
+
+ /** Concatenation over a list: "m", any string, "n", any string; checked after determinizing. */
+ public void testConcatenate2() throws Exception {
+   Automaton concatenated = Operations.concatenate(Arrays.asList(
+       Automata.makeString("m"),
+       Automata.makeAnyString(),
+       Automata.makeString("n"),
+       Automata.makeAnyString()));
+   Automaton det = Operations.determinize(concatenated);
+   for (String accepted : new String[] {"mn", "mone"}) {
+     assertTrue(Operations.run(det, accepted));
+   }
+   // No "n" present, so "m" alone must be rejected.
+   assertFalse(Operations.run(det, "m"));
+   assertFalse(Operations.isFinite(det));
+ }
+
+ /** Union of two literal strings accepts both, and its finite strings are exactly those two. */
+ public void testUnion1() throws Exception {
+   Automaton foobar = Automata.makeString("foobar");
+   Automaton barbaz = Automata.makeString("barbaz");
+   Automaton union = Operations.determinize(Operations.union(Arrays.asList(foobar, barbaz)));
+   for (String accepted : new String[] {"foobar", "barbaz"}) {
+     assertTrue(Operations.run(union, accepted));
+   }
+
+   assertMatches(union, "foobar", "barbaz");
+ }
+
+ /** Same as the two-string union test, but with the empty string as a third member. */
+ public void testUnion2() throws Exception {
+   Automaton foobar = Automata.makeString("foobar");
+   Automaton empty = Automata.makeString("");
+   Automaton barbaz = Automata.makeString("barbaz");
+   Automaton union = Operations.determinize(Operations.union(Arrays.asList(foobar, empty, barbaz)));
+   for (String accepted : new String[] {"foobar", "barbaz", ""}) {
+     assertTrue(Operations.run(union, accepted));
+   }
+
+   assertMatches(union, "", "foobar", "barbaz");
+ }
+
+ /** Minimizing a single-string automaton must not change its language. */
+ public void testMinimizeSimple() throws Exception {
+   Automaton original = Automata.makeString("foobar");
+   Automaton minimized = MinimizationOperations.minimize(original);
+
+   boolean same = Operations.sameLanguage(original, minimized);
+   assertTrue(same);
+ }
+
+ /** Minimizing a union of two strings must match the determinized, dead-state-free union. */
+ public void testMinimize2() throws Exception {
+   Automaton union = Operations.union(Arrays.asList(
+       Automata.makeString("foobar"),
+       Automata.makeString("boobar")));
+   Automaton minimized = MinimizationOperations.minimize(union);
+   Automaton reference = Operations.determinize(Operations.removeDeadStates(union));
+   assertTrue(Operations.sameLanguage(reference, minimized));
+ }
+
+ /** Reversing twice (then determinizing) must give back the original language. */
+ public void testReverse() throws Exception {
+   Automaton forward = Automata.makeString("foobar");
+   Automaton backward = Operations.reverse(forward);
+   Automaton roundTrip = Operations.reverse(backward);
+   roundTrip = Operations.determinize(roundTrip);
+
+   assertTrue(Operations.sameLanguage(forward, roundTrip));
+ }
+
+ /** optional() must add the empty string to the language without altering the input automaton. */
+ public void testOptional() throws Exception {
+   Automaton required = Automata.makeString("foobar");
+   Automaton optional = Operations.determinize(Operations.optional(required));
+
+   // The source automaton still rejects the empty string...
+   assertTrue(Operations.run(required, "foobar"));
+   assertFalse(Operations.run(required, ""));
+   // ...while the optional version accepts both.
+   assertTrue(Operations.run(optional, "foobar"));
+   assertTrue(Operations.run(optional, ""));
+ }
+
+ /** repeat(a) must accept zero, one, two and three copies of "zee". */
+ public void testRepeatAny() throws Exception {
+   Automaton zee = Automata.makeString("zee");
+   Automaton star = Operations.determinize(Operations.repeat(zee));
+   StringBuilder input = new StringBuilder();
+   for (int copies = 0; copies <= 3; copies++) {
+     assertTrue(Operations.run(star, input.toString()));
+     input.append("zee");
+   }
+ }
+
+ /** repeat(a, 2) must reject fewer than two copies of "zee" and accept two or three. */
+ public void testRepeatMin() throws Exception {
+   Automaton zee = Automata.makeString("zee");
+   Automaton atLeastTwo = Operations.determinize(Operations.repeat(zee, 2));
+   StringBuilder input = new StringBuilder();
+   for (int copies = 0; copies <= 3; copies++) {
+     boolean accepted = Operations.run(atLeastTwo, input.toString());
+     if (copies < 2) {
+       assertFalse(accepted);
+     } else {
+       assertTrue(accepted);
+     }
+     input.append("zee");
+   }
+ }
+
+ /** repeat(a, 0, 2) must accept zero to two copies of "zee" and reject three. */
+ public void testRepeatMinMax1() throws Exception {
+   Automaton zee = Automata.makeString("zee");
+   Automaton upToTwo = Operations.determinize(Operations.repeat(zee, 0, 2));
+   StringBuilder input = new StringBuilder();
+   for (int copies = 0; copies <= 3; copies++) {
+     boolean accepted = Operations.run(upToTwo, input.toString());
+     if (copies <= 2) {
+       assertTrue(accepted);
+     } else {
+       assertFalse(accepted);
+     }
+     input.append("zee");
+   }
+ }
+
+ /** repeat(a, 2, 4) must accept exactly two to four copies of "zee" (checked for 0..5 copies). */
+ public void testRepeatMinMax2() throws Exception {
+   Automaton zee = Automata.makeString("zee");
+   Automaton twoToFour = Operations.determinize(Operations.repeat(zee, 2, 4));
+   StringBuilder input = new StringBuilder();
+   for (int copies = 0; copies <= 5; copies++) {
+     boolean accepted = Operations.run(twoToFour, input.toString());
+     if (copies >= 2 && copies <= 4) {
+       assertTrue(accepted);
+     } else {
+       assertFalse(accepted);
+     }
+     input.append("zee");
+   }
+ }
+
+ /** The complement of "zee" must reject "zee" and accept the other probe strings. */
+ public void testComplement() throws Exception {
+   Automaton zee = Automata.makeString("zee");
+   Automaton notZee = Operations.determinize(Operations.complement(zee));
+   for (String probe : new String[] {"", "zee", "zeezee", "zeezeezee"}) {
+     boolean accepted = Operations.run(notZee, probe);
+     if ("zee".equals(probe)) {
+       assertFalse(accepted);
+     } else {
+       assertTrue(accepted);
+     }
+   }
+ }
+
+ /** makeInterval(17, 100, 3): the empty string is rejected; "017", "100" and "073" are accepted. */
+ public void testInterval() throws Exception {
+   Automaton interval = Automata.makeInterval(17, 100, 3);
+   Automaton det = Operations.determinize(interval);
+   assertFalse(Operations.run(det, ""));
+   for (String inRange : new String[] {"017", "100", "073"}) {
+     assertTrue(Operations.run(det, inRange));
+   }
+ }
+
+ /**
+  * Two accepting states (the initial one included) linked by 'm' transitions: the common suffix
+  * must be empty.
+  */
+ public void testCommonSuffix() throws Exception {
+   Automaton automaton = new Automaton();
+   int first = automaton.createState();
+   int second = automaton.createState();
+   automaton.setAccept(first, true);
+   automaton.setAccept(second, true);
+   automaton.addTransition(first, second, 'm');
+   automaton.addTransition(second, second, 'm');
+   automaton.finishState();
+   BytesRef suffix = Operations.getCommonSuffixBytesRef(automaton);
+   assertEquals(0, suffix.length);
+ }
+
+ public void testReverseRandom1() throws Exception {
+ int ITERS = atLeast(100);
+ for(int i=0;i allTrans = new ArrayList<>();
+ int numStates = a.getNumStates();
+ for(int s=0;s 0) {
+ mins = prefix.substring(mins.length()) + mins;
+ maxs = prefix.substring(maxs.length()) + maxs;
+ }
+ assertTrue(Operations.run(a, mins));
+ assertTrue(Operations.run(a, maxs));
+
+ for(int iter2=0;iter2<100;iter2++) {
+ int x = random().nextInt(2*max);
+ boolean expected = x >= min && x <= max;
+ String sx = Integer.toString(x);
+ if (sx.length() < digits) {
+ // Left-fill with 0s
+ sx = b.substring(sx.length()) + sx;
+ } else if (digits == 0) {
+ // Left-fill with random number of 0s:
+ int numZeros = random().nextInt(10);
+ StringBuilder sb = new StringBuilder();
+ for(int i=0;i expected = new HashSet<>();
+ for(String s : strings) {
+ IntsRef ints = new IntsRef();
+ expected.add(Util.toUTF32(s, ints));
+ }
+
+ assertEquals(expected, Operations.getFiniteStrings(Operations.determinize(a), -1));
+ }
+
+ /** Concatenating two deterministic string automata must yield a deterministic automaton. */
+ public void testConcatenatePreservesDet() throws Exception {
+   Automaton head = Automata.makeString("foobar");
+   assertTrue(head.isDeterministic());
+   Automaton tail = Automata.makeString("baz");
+   assertTrue(tail.isDeterministic());
+   Automaton joined = Operations.concatenate(Arrays.asList(head, tail));
+   assertTrue(joined.isDeterministic());
+ }
+
+ /** Concatenating "x" and "y" gives four states; removeDeadStates must bring that down to three. */
+ public void testRemoveDeadStates() throws Exception {
+   Automaton xy = Operations.concatenate(Arrays.asList(
+       Automata.makeString("x"),
+       Automata.makeString("y")));
+   assertEquals(4, xy.getNumStates());
+   Automaton pruned = Operations.removeDeadStates(xy);
+   assertEquals(3, pruned.getNumStates());
+ }
+
+ /** A freshly constructed automaton with no states is empty, before and after removeDeadStates. */
+ public void testRemoveDeadStatesEmpty1() throws Exception {
+   Automaton blank = new Automaton();
+   blank.finishState();
+   assertTrue(Operations.isEmpty(blank));
+   Automaton pruned = Operations.removeDeadStates(blank);
+   assertTrue(Operations.isEmpty(pruned));
+ }
+
+ /**
+  * A state-less automaton is empty, before and after removeDeadStates.
+  * NOTE(review): this body performs the same steps as testRemoveDeadStatesEmpty1; confirm
+  * whether a different scenario was intended here.
+  */
+ public void testRemoveDeadStatesEmpty2() throws Exception {
+   Automaton blank = new Automaton();
+   blank.finishState();
+   assertTrue(Operations.isEmpty(blank));
+   Automaton pruned = Operations.removeDeadStates(blank);
+   assertTrue(Operations.isEmpty(pruned));
+ }
+
+ /** With no accept state at all, removeDeadStates must return an automaton with zero states. */
+ public void testRemoveDeadStatesEmpty3() throws Exception {
+   Automaton noAccept = new Automaton();
+   int source = noAccept.createState();
+   int target = noAccept.createState();
+   noAccept.addTransition(source, target, 'a');
+   Automaton pruned = Operations.removeDeadStates(noAccept);
+   int remaining = pruned.getNumStates();
+   assertEquals(0, remaining);
+ }
+
+ /**
+  * Concatenating the empty automaton with anything, on either side, must yield an automaton
+  * whose set of finite strings is empty.
+  */
+ public void testConcatEmpty() throws Exception {
+   // Typed empty set (instead of a raw HashSet) shared by both assertions.
+   Set<IntsRef> none = new HashSet<IntsRef>();
+
+   // If you concat empty automaton to anything the result should still be empty:
+   Automaton a = Operations.concatenate(Automata.makeEmpty(),
+       Automata.makeString("foo"));
+   assertEquals(none, Operations.getFiniteStrings(a, -1));
+
+   a = Operations.concatenate(Automata.makeString("foo"),
+       Automata.makeEmpty());
+   assertEquals(none, Operations.getFiniteStrings(a, -1));
+ }
+
+ /** An automaton whose only transition leads to a non-accepting state must be reported empty. */
+ public void testSeemsNonEmptyButIsNot1() throws Exception {
+   Automaton automaton = new Automaton();
+   // Init state has a transition but doesn't lead to accept
+   int initial = automaton.createState();
+   int deadEnd = automaton.createState();
+   automaton.addTransition(initial, deadEnd, 'a');
+   automaton.finishState();
+   boolean empty = Operations.isEmpty(automaton);
+   assertTrue(empty);
+ }
+
+ /** An accept state that no transition leads to must not make the automaton non-empty. */
+ public void testSeemsNonEmptyButIsNot2() throws Exception {
+   Automaton automaton = new Automaton();
+   int initial = automaton.createState();
+   int deadEnd = automaton.createState();
+   automaton.addTransition(initial, deadEnd, 'a');
+   // An orphan'd accept state
+   int orphan = automaton.createState();
+   automaton.setAccept(orphan, true);
+   automaton.finishState();
+   assertTrue(Operations.isEmpty(automaton));
+ }
+
+ /**
+  * An empty-string automaton with one extra transition from state 0 to a new, non-accepting
+  * state must match the plain empty-string automaton once dead states are removed from both.
+  */
+ public void testSameLanguage1() throws Exception {
+   Automaton plain = Automata.makeEmptyString();
+   Automaton withDeadEnd = Automata.makeEmptyString();
+   int deadEnd = withDeadEnd.createState();
+   withDeadEnd.addTransition(0, deadEnd, 'a');
+   withDeadEnd.finishState();
+   Automaton prunedPlain = Operations.removeDeadStates(plain);
+   Automaton prunedDeadEnd = Operations.removeDeadStates(withDeadEnd);
+   assertTrue(Operations.sameLanguage(prunedPlain, prunedDeadEnd));
+ }
+
+ /**
+  * Applies one randomly chosen operation that is meant to leave the accepted language unchanged
+  * (determinize, minimize, removeDeadStates, reverse twice, concatenate the empty string, union
+  * with the empty automaton, or nothing) and returns the result.
+  *
+  * @param a automaton to transform
+  * @return the transformed automaton; the same instance as {@code a} when "do nothing" is picked
+  */
+ private Automaton randomNoOp(Automaton a) {
+   switch (random().nextInt(7)) {
+     case 0:
+       if (VERBOSE) {
+         System.out.println(" randomNoOp: determinize");
+       }
+       return Operations.determinize(a);
+     case 1:
+       if (VERBOSE) {
+         System.out.println(" randomNoOp: minimize");
+       }
+       return MinimizationOperations.minimize(a);
+     case 2:
+       if (VERBOSE) {
+         System.out.println(" randomNoOp: removeDeadStates");
+       }
+       return Operations.removeDeadStates(a);
+     case 3:
+       if (VERBOSE) {
+         System.out.println(" randomNoOp: reverse reverse");
+       }
+       // Reverse, apply another random no-op on the reversed form, then reverse back.
+       a = Operations.reverse(a);
+       a = randomNoOp(a);
+       return Operations.reverse(a);
+     case 4:
+       if (VERBOSE) {
+         System.out.println(" randomNoOp: concat empty string");
+       }
+       return Operations.concatenate(a, Automata.makeEmptyString());
+     case 5:
+       if (VERBOSE) {
+         System.out.println(" randomNoOp: union empty automaton");
+       }
+       return Operations.union(a, Automata.makeEmpty());
+     case 6:
+       if (VERBOSE) {
+         System.out.println(" randomNoOp: do nothing!");
+       }
+       return a;
+     default:
+       // nextInt(7) only yields 0..6. Fail loudly even when assertions are disabled instead of
+       // handing a null automaton back to the caller.
+       throw new AssertionError("unexpected randomNoOp case");
+   }
+ }
+
+ /**
+  * Builds an automaton accepting the given terms, randomly choosing between a generic union of
+  * per-term string automata and makeStringUnion over the sorted terms, then passes the result
+  * through randomNoOp.
+  *
+  * @param terms terms to accept; each is decoded with utf8ToString() on the generic-union path
+  * @return automaton for the union of the terms
+  */
+ private Automaton unionTerms(Collection<BytesRef> terms) {
+   Automaton a;
+   if (random().nextBoolean()) {
+     if (VERBOSE) {
+       System.out.println("TEST: unionTerms: use union");
+     }
+     List<Automaton> as = new ArrayList<>();
+     for (BytesRef term : terms) {
+       as.add(Automata.makeString(term.utf8ToString()));
+     }
+     a = Operations.union(as);
+   } else {
+     if (VERBOSE) {
+       System.out.println("TEST: unionTerms: use makeStringUnion");
+     }
+     // Sort a copy so the caller's collection is left untouched.
+     List<BytesRef> termsList = new ArrayList<>(terms);
+     Collections.sort(termsList);
+     a = Automata.makeStringUnion(termsList);
+   }
+
+   return randomNoOp(a);
+ }
+
+ /** Returns a random term for the randomized tests, drawn from TestUtil's realistic-Unicode generator. */
+ private String getRandomString() {
+   // Alternative generator, left here for switching by hand:
+   //return TestUtil.randomSimpleString(random());
+   String term = TestUtil.randomRealisticUnicodeString(random());
+   return term;
+ }
+
+ public void testRandomFinite() throws Exception {
+
+ int numTerms = atLeast(10);
+ int iters = atLeast(100);
+
+ if (VERBOSE) {
+ System.out.println("TEST: numTerms" + numTerms + " iters=" + iters);
+ }
+
+ Set terms = new HashSet<>();
+ while (terms.size() < numTerms) {
+ terms.add(new BytesRef(getRandomString()));
+ }
+
+ Automaton a = unionTerms(terms);
+ assertSame(terms, a);
+
+ for(int iter=0;iter newTerms = new HashSet<>();
+ BytesRef prefix = new BytesRef(getRandomString());
+ for(BytesRef term : terms) {
+ BytesRef newTerm = BytesRef.deepCopyOf(prefix);
+ newTerm.append(term);
+ newTerms.add(newTerm);
+ }
+ terms = newTerms;
+ boolean wasDeterministic1 = a.isDeterministic();
+ a = Operations.concatenate(Automata.makeString(prefix.utf8ToString()), a);
+ assertEquals(wasDeterministic1, a.isDeterministic());
+ }
+ break;
+
+ case 1:
+ // concatenate suffix
+ {
+ BytesRef suffix = new BytesRef(getRandomString());
+ if (VERBOSE) {
+ System.out.println(" op=concat suffix " + suffix);
+ }
+ Set newTerms = new HashSet<>();
+ for(BytesRef term : terms) {
+ BytesRef newTerm = BytesRef.deepCopyOf(term);
+ newTerm.append(suffix);
+ newTerms.add(newTerm);
+ }
+ terms = newTerms;
+ a = Operations.concatenate(a, Automata.makeString(suffix.utf8ToString()));
+ }
+ break;
+
+ case 2:
+ // determinize
+ if (VERBOSE) {
+ System.out.println(" op=determinize");
+ }
+ a = Operations.determinize(a);
+ assertTrue(a.isDeterministic());
+ break;
+
+ case 3:
+ if (VERBOSE) {
+ System.out.println(" op=minimize");
+ }
+ // minimize
+ a = MinimizationOperations.minimize(a);
+ break;
+
+ case 4:
+ // union
+ {
+ if (VERBOSE) {
+ System.out.println(" op=union");
+ }
+ Set newTerms = new HashSet<>();
+ int numNewTerms = random().nextInt(5);
+ while (newTerms.size() < numNewTerms) {
+ newTerms.add(new BytesRef(getRandomString()));
+ }
+ terms.addAll(newTerms);
+ Automaton newA = unionTerms(newTerms);
+ a = Operations.union(a, newA);
+ }
+ break;
+
+ case 5:
+ // optional
+ {
+ if (VERBOSE) {
+ System.out.println(" op=optional");
+ }
+ a = Operations.optional(a);
+ terms.add(new BytesRef());
+ }
+ break;
+
+ case 6:
+ // minus finite
+ {
+ if (VERBOSE) {
+ System.out.println(" op=minus finite");
+ }
+ if (terms.size() > 0) {
+ RandomAcceptedStrings rasl = new RandomAcceptedStrings(Operations.removeDeadStates(a));
+ Set toRemove = new HashSet<>();
+ int numToRemove = TestUtil.nextInt(random(), 1, (terms.size()+1)/2);
+ while (toRemove.size() < numToRemove) {
+ int[] ints = rasl.getRandomAcceptedString(random());
+ BytesRef term = new BytesRef(UnicodeUtil.newString(ints, 0, ints.length));
+ if (toRemove.contains(term) == false) {
+ toRemove.add(term);
+ }
+ }
+ for(BytesRef term : toRemove) {
+ boolean removed = terms.remove(term);
+ assertTrue(removed);
+ }
+ Automaton a2 = unionTerms(toRemove);
+ a = Operations.minus(a, a2);
+ }
+ }
+ break;
+
+ case 7:
+ {
+ // minus infinite
+ List as = new ArrayList<>();
+ int count = TestUtil.nextInt(random(), 1, 5);
+ Set prefixes = new HashSet<>();
+ while(prefixes.size() < count) {
+ // prefix is a leading ascii byte; we remove * from a
+ int prefix = random().nextInt(128);
+ prefixes.add(prefix);
+ }
+
+ if (VERBOSE) {
+ System.out.println(" op=minus infinite prefixes=" + prefixes);
+ }
+
+ for(int prefix : prefixes) {
+ // prefix is a leading ascii byte; we remove * from a
+ Automaton a2 = new Automaton();
+ int init = a2.createState();
+ int state = a2.createState();
+ a2.addTransition(init, state, prefix);
+ a2.setAccept(state, true);
+ a2.addTransition(state, state, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
+ a2.finishState();
+ as.add(a2);
+ Iterator it = terms.iterator();
+ while (it.hasNext()) {
+ BytesRef term = it.next();
+ if (term.length > 0 && (term.bytes[term.offset] & 0xFF) == prefix) {
+ it.remove();
+ }
+ }
+ }
+ Automaton a2 = randomNoOp(Operations.union(as));
+ a = Operations.minus(a, a2);
+ }
+ break;
+
+ case 8:
+ {
+ int count = TestUtil.nextInt(random(), 10, 20);
+ if (VERBOSE) {
+ System.out.println(" op=intersect infinite count=" + count);
+ }
+ // intersect infinite
+ List as = new ArrayList<>();
+
+ Set prefixes = new HashSet<>();
+ while(prefixes.size() < count) {
+ int prefix = random().nextInt(128);
+ prefixes.add(prefix);
+ }
+ if (VERBOSE) {
+ System.out.println(" prefixes=" + prefixes);
+ }
+
+ for(int prefix : prefixes) {
+ // prefix is a leading ascii byte; we retain * in a
+ Automaton a2 = new Automaton();
+ int init = a2.createState();
+ int state = a2.createState();
+ a2.addTransition(init, state, prefix);
+ a2.setAccept(state, true);
+ a2.addTransition(state, state, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
+ a2.finishState();
+ as.add(a2);
+ prefixes.add(prefix);
+ }
+
+ Automaton a2 = Operations.union(as);
+ if (random().nextBoolean()) {
+ a2 = Operations.determinize(a2);
+ } else if (random().nextBoolean()) {
+ a2 = MinimizationOperations.minimize(a2);
+ }
+ a = Operations.intersection(a, a2);
+
+ Iterator it = terms.iterator();
+ while (it.hasNext()) {
+ BytesRef term = it.next();
+ if (term.length == 0 || prefixes.contains(term.bytes[term.offset]&0xff) == false) {
+ if (VERBOSE) {
+ System.out.println(" drop term=" + term);
+ }
+ it.remove();
+ } else {
+ if (VERBOSE) {
+ System.out.println(" keep term=" + term);
+ }
+ }
+ }
+ }
+ break;
+
+ case 9:
+ // reverse
+ {
+ if (VERBOSE) {
+ System.out.println(" op=reverse");
+ }
+ a = Operations.reverse(a);
+ Set newTerms = new HashSet<>();
+ for(BytesRef term : terms) {
+ newTerms.add(new BytesRef(new StringBuilder(term.utf8ToString()).reverse().toString()));
+ }
+ terms = newTerms;
+ }
+ break;
+
+ case 10:
+ if (VERBOSE) {
+ System.out.println(" op=randomNoOp");
+ }
+ a = randomNoOp(a);
+ break;
+
+ case 11:
+ // interval
+ {
+ int min = random().nextInt(1000);
+ int max = min + random().nextInt(50);
+ // digits must be non-zero else we make cycle
+ int digits = Integer.toString(max).length();
+ if (VERBOSE) {
+ System.out.println(" op=union interval min=" + min + " max=" + max + " digits=" + digits);
+ }
+ a = Operations.union(a, Automata.makeInterval(min, max, digits));
+ StringBuilder b = new StringBuilder();
+ for(int i=0;i addTerms = new HashSet<>();
+ while (addTerms.size() < count) {
+ addTerms.add(new BytesRef(getRandomString()));
+ }
+ if (VERBOSE) {
+ for(BytesRef term : addTerms) {
+ System.out.println(" term=" + term);
+ }
+ }
+ Automaton a2 = unionTerms(addTerms);
+ Set newTerms = new HashSet<>();
+ if (random().nextBoolean()) {
+ // suffix
+ if (VERBOSE) {
+ System.out.println(" do suffix");
+ }
+ a = Operations.concatenate(a, randomNoOp(a2));
+ for(BytesRef term : terms) {
+ for(BytesRef suffix : addTerms) {
+ BytesRef newTerm = BytesRef.deepCopyOf(term);
+ newTerm.append(suffix);
+ newTerms.add(newTerm);
+ }
+ }
+ } else {
+ // prefix
+ if (VERBOSE) {
+ System.out.println(" do prefix");
+ }
+ a = Operations.concatenate(randomNoOp(a2), a);
+ for(BytesRef term : terms) {
+ for(BytesRef prefix : addTerms) {
+ BytesRef newTerm = BytesRef.deepCopyOf(prefix);
+ newTerm.append(term);
+ newTerms.add(newTerm);
+ }
+ }
+ }
+
+ terms = newTerms;
+ }
+ break;
+ }
+
+ // assertSame(terms, a);
+ assertEquals(AutomatonTestUtil.isDeterministicSlow(a), a.isDeterministic());
+ }
+
+ assertSame(terms, a);
+ }
+
+ /**
+  * Asserts that automaton {@code a} is finite, not total, and accepts exactly {@code terms}.
+  * The check is done four ways: running each term through the determinized automaton, comparing
+  * getFiniteStrings against the terms as UTF-32 ints, sameLanguage against an automaton rebuilt
+  * from the terms, and comparing getFiniteStrings of the UTF32ToUTF8-converted automaton against
+  * the terms' ints as produced by Util.toIntsRef.
+  *
+  * On any AssertionError the terms and the automaton (as dot) are printed before rethrowing.
+  * This is an overload alongside the inherited two-Object assertSame, not a replacement for it.
+  *
+  * @param terms the exact set of terms the automaton is expected to accept
+  * @param a the automaton under test
+  */
+ private void assertSame(Collection<BytesRef> terms, Automaton a) {
+
+   try {
+     assertTrue(Operations.isFinite(a));
+     assertFalse(Operations.isTotal(a));
+
+     Automaton detA = Operations.determinize(a);
+
+     // Make sure all terms are accepted:
+     for (BytesRef term : terms) {
+       assertTrue("failed to accept term=" + term.utf8ToString(), Operations.run(detA, term.utf8ToString()));
+     }
+
+     // Use getFiniteStrings:
+     Set<IntsRef> expected = new HashSet<>();
+     for (BytesRef term : terms) {
+       IntsRef intsRef = new IntsRef();
+       Util.toUTF32(term.utf8ToString(), intsRef);
+       expected.add(intsRef);
+     }
+     Set<IntsRef> actual = Operations.getFiniteStrings(a, -1);
+
+     if (expected.equals(actual) == false) {
+       System.out.println("FAILED:");
+       for (IntsRef term : expected) {
+         if (actual.contains(term) == false) {
+           System.out.println(" term=" + term + " should be accepted but isn't");
+         }
+       }
+       for (IntsRef term : actual) {
+         if (expected.contains(term) == false) {
+           System.out.println(" term=" + term + " is accepted but should not be");
+         }
+       }
+       throw new AssertionError("mismatch");
+     }
+
+     // Use sameLanguage (reusing the determinized automaton computed above):
+     Automaton a2 = Operations.removeDeadStates(Operations.determinize(unionTerms(terms)));
+     assertTrue(Operations.sameLanguage(a2, Operations.removeDeadStates(detA)));
+
+     // Do same check, in UTF8 space
+     Automaton utf8 = randomNoOp(new UTF32ToUTF8().convert(a));
+
+     Set<IntsRef> expected2 = new HashSet<>();
+     for (BytesRef term : terms) {
+       IntsRef intsRef = new IntsRef();
+       Util.toIntsRef(term, intsRef);
+       expected2.add(intsRef);
+     }
+     assertEquals(expected2, Operations.getFiniteStrings(utf8, -1));
+   } catch (AssertionError ae) {
+     // Dump enough context to reproduce the failure, then let the test fail as usual.
+     System.out.println("TEST: FAILED: not same");
+     System.out.println(" terms (count=" + terms.size() + "):");
+     for (BytesRef term : terms) {
+       System.out.println(" " + term);
+     }
+     System.out.println(" automaton:");
+     System.out.println(a.toDot());
+     //a.writeDot("fail");
+     throw ae;
+   }
+ }
+}
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java
deleted file mode 100644
index 1b2e62df3c9e..000000000000
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java
+++ /dev/null
@@ -1,146 +0,0 @@
-package org.apache.lucene.util.automaton;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.*;
-
-import org.apache.lucene.util.*;
-
-import com.carrotsearch.randomizedtesting.generators.RandomInts;
-
-public class TestBasicOperations extends LuceneTestCase {
- /** Test string union. */
- public void testStringUnion() {
- List strings = new ArrayList<>();
- for (int i = RandomInts.randomIntBetween(random(), 0, 1000); --i >= 0;) {
- strings.add(new BytesRef(TestUtil.randomUnicodeString(random())));
- }
-
- Collections.sort(strings);
- Automaton union = BasicAutomata.makeStringUnion(strings);
- assertTrue(union.isDeterministic());
- assertTrue(BasicOperations.sameLanguage(union, naiveUnion(strings)));
- }
-
- private static Automaton naiveUnion(List strings) {
- Automaton [] eachIndividual = new Automaton [strings.size()];
- int i = 0;
- for (BytesRef bref : strings) {
- eachIndividual[i++] = BasicAutomata.makeString(bref.utf8ToString());
- }
- return BasicOperations.union(Arrays.asList(eachIndividual));
- }
-
- /** Test optimization to concatenate() */
- public void testSingletonConcatenate() {
- Automaton singleton = BasicAutomata.makeString("prefix");
- Automaton expandedSingleton = singleton.cloneExpanded();
- Automaton other = BasicAutomata.makeCharRange('5', '7');
- Automaton concat = BasicOperations.concatenate(singleton, other);
- assertTrue(concat.isDeterministic());
- assertTrue(BasicOperations.sameLanguage(BasicOperations.concatenate(expandedSingleton, other), concat));
- }
-
- /** Test optimization to concatenate() to an NFA */
- public void testSingletonNFAConcatenate() {
- Automaton singleton = BasicAutomata.makeString("prefix");
- Automaton expandedSingleton = singleton.cloneExpanded();
- // an NFA (two transitions for 't' from initial state)
- Automaton nfa = BasicOperations.union(BasicAutomata.makeString("this"),
- BasicAutomata.makeString("three"));
- Automaton concat = BasicOperations.concatenate(singleton, nfa);
- assertFalse(concat.isDeterministic());
- assertTrue(BasicOperations.sameLanguage(BasicOperations.concatenate(expandedSingleton, nfa), concat));
- }
-
- /** Test optimization to concatenate() with empty String */
- public void testEmptySingletonConcatenate() {
- Automaton singleton = BasicAutomata.makeString("");
- Automaton expandedSingleton = singleton.cloneExpanded();
- Automaton other = BasicAutomata.makeCharRange('5', '7');
- Automaton concat1 = BasicOperations.concatenate(expandedSingleton, other);
- Automaton concat2 = BasicOperations.concatenate(singleton, other);
- assertTrue(concat2.isDeterministic());
- assertTrue(BasicOperations.sameLanguage(concat1, concat2));
- assertTrue(BasicOperations.sameLanguage(other, concat1));
- assertTrue(BasicOperations.sameLanguage(other, concat2));
- }
-
- /** Test concatenation with empty language returns empty */
- public void testEmptyLanguageConcatenate() {
- Automaton a = BasicAutomata.makeString("a");
- Automaton concat = BasicOperations.concatenate(a, BasicAutomata.makeEmpty());
- assertTrue(BasicOperations.isEmpty(concat));
- }
-
- /** Test optimization to concatenate() with empty String to an NFA */
- public void testEmptySingletonNFAConcatenate() {
- Automaton singleton = BasicAutomata.makeString("");
- Automaton expandedSingleton = singleton.cloneExpanded();
- // an NFA (two transitions for 't' from initial state)
- Automaton nfa = BasicOperations.union(BasicAutomata.makeString("this"),
- BasicAutomata.makeString("three"));
- Automaton concat1 = BasicOperations.concatenate(expandedSingleton, nfa);
- Automaton concat2 = BasicOperations.concatenate(singleton, nfa);
- assertFalse(concat2.isDeterministic());
- assertTrue(BasicOperations.sameLanguage(concat1, concat2));
- assertTrue(BasicOperations.sameLanguage(nfa, concat1));
- assertTrue(BasicOperations.sameLanguage(nfa, concat2));
- }
-
- /** Test singletons work correctly */
- public void testSingleton() {
- Automaton singleton = BasicAutomata.makeString("foobar");
- Automaton expandedSingleton = singleton.cloneExpanded();
- assertTrue(BasicOperations.sameLanguage(singleton, expandedSingleton));
-
- singleton = BasicAutomata.makeString("\ud801\udc1c");
- expandedSingleton = singleton.cloneExpanded();
- assertTrue(BasicOperations.sameLanguage(singleton, expandedSingleton));
- }
-
- public void testGetRandomAcceptedString() throws Throwable {
- final int ITER1 = atLeast(100);
- final int ITER2 = atLeast(100);
- for(int i=0;i 0) {
- assertTrue(automata[n-1].subsetOf(automata[n]));
- assertTrue(automata[n-1].subsetOf(tautomata[n]));
- assertTrue(tautomata[n-1].subsetOf(automata[n]));
- assertTrue(tautomata[n-1].subsetOf(tautomata[n]));
+ assertTrue(Operations.subsetOf(Operations.removeDeadStates(automata[n-1]),
+ Operations.removeDeadStates(automata[n])));
+ assertTrue(Operations.subsetOf(Operations.removeDeadStates(automata[n-1]),
+ Operations.removeDeadStates(tautomata[n])));
+ assertTrue(Operations.subsetOf(Operations.removeDeadStates(tautomata[n-1]),
+ Operations.removeDeadStates(automata[n])));
+ assertTrue(Operations.subsetOf(Operations.removeDeadStates(tautomata[n-1]),
+ Operations.removeDeadStates(tautomata[n])));
assertNotSame(automata[n-1], automata[n]);
}
// check that Lev(N) is a subset of LevT(N)
- assertTrue(automata[n].subsetOf(tautomata[n]));
+ assertTrue(Operations.subsetOf(Operations.removeDeadStates(automata[n]),
+ Operations.removeDeadStates(tautomata[n])));
// special checks for specific n
switch(n) {
case 0:
// easy, matches the string itself
- assertTrue(BasicOperations.sameLanguage(BasicAutomata.makeString(s), automata[0]));
- assertTrue(BasicOperations.sameLanguage(BasicAutomata.makeString(s), tautomata[0]));
+ assertTrue(Operations.sameLanguage(Automata.makeString(s), Operations.removeDeadStates(automata[0])));
+ assertTrue(Operations.sameLanguage(Automata.makeString(s), Operations.removeDeadStates(tautomata[0])));
break;
case 1:
// generate a lev1 naively, and check the accepted lang is the same.
- assertTrue(BasicOperations.sameLanguage(naiveLev1(s), automata[1]));
- assertTrue(BasicOperations.sameLanguage(naiveLev1T(s), tautomata[1]));
+ assertTrue(Operations.sameLanguage(naiveLev1(s), Operations.removeDeadStates(automata[1])));
+ assertTrue(Operations.sameLanguage(naiveLev1T(s), Operations.removeDeadStates(tautomata[1])));
break;
default:
assertBruteForce(s, automata[n], n);
@@ -114,13 +119,13 @@ private void assertLev(String s, int maxDistance) {
* substitutions of s.
*/
private Automaton naiveLev1(String s) {
- Automaton a = BasicAutomata.makeString(s);
- a = BasicOperations.union(a, insertionsOf(s));
- MinimizationOperations.minimize(a);
- a = BasicOperations.union(a, deletionsOf(s));
- MinimizationOperations.minimize(a);
- a = BasicOperations.union(a, substitutionsOf(s));
- MinimizationOperations.minimize(a);
+ Automaton a = Automata.makeString(s);
+ a = Operations.union(a, insertionsOf(s));
+ a = MinimizationOperations.minimize(a);
+ a = Operations.union(a, deletionsOf(s));
+ a = MinimizationOperations.minimize(a);
+ a = Operations.union(a, substitutionsOf(s));
+ a = MinimizationOperations.minimize(a);
return a;
}
@@ -131,8 +136,8 @@ private Automaton naiveLev1(String s) {
*/
private Automaton naiveLev1T(String s) {
Automaton a = naiveLev1(s);
- a = BasicOperations.union(a, transpositionsOf(s));
- MinimizationOperations.minimize(a);
+ a = Operations.union(a, transpositionsOf(s));
+ a = MinimizationOperations.minimize(a);
return a;
}
@@ -144,15 +149,14 @@ private Automaton insertionsOf(String s) {
List list = new ArrayList<>();
for (int i = 0; i <= s.length(); i++) {
- Automaton a = BasicAutomata.makeString(s.substring(0, i));
- a = BasicOperations.concatenate(a, BasicAutomata.makeAnyChar());
- a = BasicOperations.concatenate(a, BasicAutomata.makeString(s
- .substring(i)));
+ Automaton a = Automata.makeString(s.substring(0, i));
+ a = Operations.concatenate(a, Automata.makeAnyChar());
+ a = Operations.concatenate(a, Automata.makeString(s.substring(i)));
list.add(a);
}
- Automaton a = BasicOperations.union(list);
- MinimizationOperations.minimize(a);
+ Automaton a = Operations.union(list);
+ a = MinimizationOperations.minimize(a);
return a;
}
@@ -164,15 +168,13 @@ private Automaton deletionsOf(String s) {
List list = new ArrayList<>();
for (int i = 0; i < s.length(); i++) {
- Automaton a = BasicAutomata.makeString(s.substring(0, i));
- a = BasicOperations.concatenate(a, BasicAutomata.makeString(s
- .substring(i + 1)));
- a.expandSingleton();
+ Automaton a = Automata.makeString(s.substring(0, i));
+ a = Operations.concatenate(a, Automata.makeString(s.substring(i + 1)));
list.add(a);
}
- Automaton a = BasicOperations.union(list);
- MinimizationOperations.minimize(a);
+ Automaton a = Operations.union(list);
+ a = MinimizationOperations.minimize(a);
return a;
}
@@ -184,15 +186,14 @@ private Automaton substitutionsOf(String s) {
List list = new ArrayList<>();
for (int i = 0; i < s.length(); i++) {
- Automaton a = BasicAutomata.makeString(s.substring(0, i));
- a = BasicOperations.concatenate(a, BasicAutomata.makeAnyChar());
- a = BasicOperations.concatenate(a, BasicAutomata.makeString(s
- .substring(i + 1)));
+ Automaton a = Automata.makeString(s.substring(0, i));
+ a = Operations.concatenate(a, Automata.makeAnyChar());
+ a = Operations.concatenate(a, Automata.makeString(s.substring(i + 1)));
list.add(a);
}
- Automaton a = BasicOperations.union(list);
- MinimizationOperations.minimize(a);
+ Automaton a = Operations.union(list);
+ a = MinimizationOperations.minimize(a);
return a;
}
@@ -201,8 +202,9 @@ private Automaton substitutionsOf(String s) {
* (transposing two adjacent characters)
*/
private Automaton transpositionsOf(String s) {
- if (s.length() < 2)
- return BasicAutomata.makeEmpty();
+ if (s.length() < 2) {
+ return Automata.makeEmpty();
+ }
List list = new ArrayList<>();
for (int i = 0; i < s.length()-1; i++) {
StringBuilder sb = new StringBuilder();
@@ -211,11 +213,12 @@ private Automaton transpositionsOf(String s) {
sb.append(s.charAt(i));
sb.append(s.substring(i+2, s.length()));
String st = sb.toString();
- if (!st.equals(s))
- list.add(BasicAutomata.makeString(st));
+ if (!st.equals(s)) {
+ list.add(Automata.makeString(st));
+ }
}
- Automaton a = BasicOperations.union(list);
- MinimizationOperations.minimize(a);
+ Automaton a = Operations.union(list);
+ a = MinimizationOperations.minimize(a);
return a;
}
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestMinimize.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestMinimize.java
index e306253b0adf..82a2914f43bb 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestMinimize.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestMinimize.java
@@ -24,13 +24,13 @@
*/
public class TestMinimize extends LuceneTestCase {
/** the minimal and non-minimal are compared to ensure they are the same. */
- public void test() {
+ public void testBasic() {
int num = atLeast(200);
for (int i = 0; i < num; i++) {
Automaton a = AutomatonTestUtil.randomAutomaton(random());
- Automaton b = a.clone();
- MinimizationOperations.minimize(b);
- assertTrue(BasicOperations.sameLanguage(a, b));
+ Automaton la = Operations.determinize(Operations.removeDeadStates(a));
+ Automaton lb = MinimizationOperations.minimize(a);
+ assertTrue(Operations.sameLanguage(la, lb));
}
}
@@ -41,12 +41,22 @@ public void testAgainstBrzozowski() {
int num = atLeast(200);
for (int i = 0; i < num; i++) {
Automaton a = AutomatonTestUtil.randomAutomaton(random());
- AutomatonTestUtil.minimizeSimple(a);
- Automaton b = a.clone();
- MinimizationOperations.minimize(b);
- assertTrue(BasicOperations.sameLanguage(a, b));
- assertEquals(a.getNumberOfStates(), b.getNumberOfStates());
- assertEquals(a.getNumberOfTransitions(), b.getNumberOfTransitions());
+ a = AutomatonTestUtil.minimizeSimple(a);
+ Automaton b = MinimizationOperations.minimize(a);
+ assertTrue(Operations.sameLanguage(a, b));
+ assertEquals(a.getNumStates(), b.getNumStates());
+ int numStates = a.getNumStates();
+
+ int sum1 = 0;
+ for(int s=0;s strings = new ArrayList<>();
+ for (int i = RandomInts.randomIntBetween(random(), 0, 1000); --i >= 0;) {
+ strings.add(new BytesRef(TestUtil.randomUnicodeString(random())));
+ }
+
+ Collections.sort(strings);
+ Automaton union = Automata.makeStringUnion(strings);
+ assertTrue(union.isDeterministic());
+ assertTrue(Operations.sameLanguage(union, naiveUnion(strings)));
+ }
+
+ private static Automaton naiveUnion(List strings) {
+ Automaton[] eachIndividual = new Automaton[strings.size()];
+ int i = 0;
+ for (BytesRef bref : strings) {
+ eachIndividual[i++] = Automata.makeString(bref.utf8ToString());
+ }
+ return Operations.determinize(Operations.union(Arrays.asList(eachIndividual)));
+ }
+
+ /** Test concatenation with empty language returns empty */
+ public void testEmptyLanguageConcatenate() {
+ Automaton a = Automata.makeString("a");
+ Automaton concat = Operations.concatenate(a, Automata.makeEmpty());
+ assertTrue(Operations.isEmpty(concat));
+ }
+
+ /** Test optimization to concatenate() with empty String to an NFA */
+ public void testEmptySingletonNFAConcatenate() {
+ Automaton singleton = Automata.makeString("");
+ Automaton expandedSingleton = singleton;
+ // an NFA (two transitions for 't' from initial state)
+ Automaton nfa = Operations.union(Automata.makeString("this"),
+ Automata.makeString("three"));
+ Automaton concat1 = Operations.concatenate(expandedSingleton, nfa);
+ Automaton concat2 = Operations.concatenate(singleton, nfa);
+ assertFalse(concat2.isDeterministic());
+ assertTrue(Operations.sameLanguage(Operations.determinize(concat1),
+ Operations.determinize(concat2)));
+ assertTrue(Operations.sameLanguage(Operations.determinize(nfa),
+ Operations.determinize(concat1)));
+ assertTrue(Operations.sameLanguage(Operations.determinize(nfa),
+ Operations.determinize(concat2)));
+ }
+
+ public void testGetRandomAcceptedString() throws Throwable {
+ final int ITER1 = atLeast(100);
+ final int ITER2 = atLeast(100);
+ for(int i=0;i getFiniteStrings(Automaton a, int limit, boolean testRecursive) {
- Set result = SpecialOperations.getFiniteStrings(a, limit);
+ Set result = Operations.getFiniteStrings(a, limit);
if (testRecursive) {
assertEquals(AutomatonTestUtil.getFiniteStringsRecursive(a, limit), result);
}
@@ -56,8 +128,8 @@ private Set getFiniteStrings(Automaton a, int limit, boolean testRecurs
* Basic test for getFiniteStrings
*/
public void testFiniteStringsBasic() {
- Automaton a = BasicOperations.union(BasicAutomata.makeString("dog"), BasicAutomata.makeString("duck"));
- MinimizationOperations.minimize(a);
+ Automaton a = Operations.union(Automata.makeString("dog"), Automata.makeString("duck"));
+ a = MinimizationOperations.minimize(a);
Set strings = getFiniteStrings(a, -1, true);
assertEquals(2, strings.size());
IntsRef dog = new IntsRef();
@@ -74,7 +146,7 @@ public void testFiniteStringsEatsStack() {
String bigString1 = new String(chars);
TestUtil.randomFixedLengthUnicodeString(random(), chars, 0, chars.length);
String bigString2 = new String(chars);
- Automaton a = BasicOperations.union(BasicAutomata.makeString(bigString1), BasicAutomata.makeString(bigString2));
+ Automaton a = Operations.union(Automata.makeString(bigString1), Automata.makeString(bigString2));
Set strings = getFiniteStrings(a, -1, false);
assertEquals(2, strings.size());
IntsRef scratch = new IntsRef();
@@ -92,10 +164,10 @@ public void testRandomFiniteStrings1() {
}
Set strings = new HashSet();
- List automata = new ArrayList();
+ List automata = new ArrayList<>();
for(int i=0;i actual = getFiniteStrings(a, -1, true);
@@ -158,7 +225,7 @@ private static String toString(IntsRef ints) {
public void testWithCycle() throws Exception {
try {
- SpecialOperations.getFiniteStrings(new RegExp("abc.*", RegExp.NONE).toAutomaton(), -1);
+ Operations.getFiniteStrings(new RegExp("abc.*", RegExp.NONE).toAutomaton(), -1);
fail("did not hit exception");
} catch (IllegalArgumentException iae) {
// expected
@@ -174,12 +241,12 @@ public void testRandomFiniteStrings2() {
try {
// Must pass a limit because the random automaton
// can accept MANY strings:
- SpecialOperations.getFiniteStrings(a, TestUtil.nextInt(random(), 1, 1000));
+ Operations.getFiniteStrings(a, TestUtil.nextInt(random(), 1, 1000));
// NOTE: cannot do this, because the method is not
// guaranteed to detect cycles when you have a limit
- //assertTrue(SpecialOperations.isFinite(a));
+ //assertTrue(Operations.isFinite(a));
} catch (IllegalArgumentException iae) {
- assertFalse(SpecialOperations.isFinite(a));
+ assertFalse(Operations.isFinite(a));
}
}
}
@@ -187,7 +254,7 @@ public void testRandomFiniteStrings2() {
public void testInvalidLimit() {
Automaton a = AutomatonTestUtil.randomAutomaton(random());
try {
- SpecialOperations.getFiniteStrings(a, -7);
+ Operations.getFiniteStrings(a, -7);
fail("did not hit exception");
} catch (IllegalArgumentException iae) {
// expected
@@ -197,7 +264,7 @@ public void testInvalidLimit() {
public void testInvalidLimit2() {
Automaton a = AutomatonTestUtil.randomAutomaton(random());
try {
- SpecialOperations.getFiniteStrings(a, 0);
+ Operations.getFiniteStrings(a, 0);
fail("did not hit exception");
} catch (IllegalArgumentException iae) {
// expected
@@ -205,7 +272,7 @@ public void testInvalidLimit2() {
}
public void testSingletonNoLimit() {
- Set result = SpecialOperations.getFiniteStrings(BasicAutomata.makeString("foobar"), -1);
+ Set result = Operations.getFiniteStrings(Automata.makeString("foobar"), -1);
assertEquals(1, result.size());
IntsRef scratch = new IntsRef();
Util.toUTF32("foobar".toCharArray(), 0, 6, scratch);
@@ -213,7 +280,7 @@ public void testSingletonNoLimit() {
}
public void testSingletonLimit1() {
- Set result = SpecialOperations.getFiniteStrings(BasicAutomata.makeString("foobar"), 1);
+ Set result = Operations.getFiniteStrings(Automata.makeString("foobar"), 1);
assertEquals(1, result.size());
IntsRef scratch = new IntsRef();
Util.toUTF32("foobar".toCharArray(), 0, 6, scratch);
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java
index d5faa4dfef90..95b19e049566 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java
@@ -17,13 +17,17 @@
* limitations under the License.
*/
+import java.nio.charset.StandardCharsets;
+import java.util.HashSet;
+import java.util.Random;
+import java.util.Set;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
-import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
-
-import java.nio.charset.StandardCharsets;
-import java.util.Random;
+import org.apache.lucene.util.fst.Util;
public class TestUTF32ToUTF8 extends LuceneTestCase {
@@ -151,12 +155,7 @@ public void testRandomRanges() throws Exception {
continue;
}
- final Automaton a = new Automaton();
- final State end = new State();
- end.setAccept(true);
- a.getInitialState().addTransition(new Transition(startCode, endCode, end));
- a.setDeterministic(true);
-
+ Automaton a = Automata.makeCharRange(startCode, endCode);
testOne(r, new ByteRunAutomaton(a), startCode, endCode, ITERS_PER_DFA);
}
}
@@ -208,6 +207,20 @@ public void testRandomRegexes() throws Exception {
assertAutomaton(new RegExp(AutomatonTestUtil.randomRegexp(random()), RegExp.NONE).toAutomaton());
}
}
+
+ public void testSingleton() throws Exception {
+ int iters = atLeast(100);
+ for(int iter=0;iter set = new HashSet<>();
+ set.add(ints);
+ assertEquals(set, Operations.getFiniteStrings(utf8, -1));
+ }
+ }
private void assertAutomaton(Automaton automaton) throws Exception {
CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton);
diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
index 26aaeb07d9b2..8c3f60cf74d4 100644
--- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
+++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
@@ -17,6 +17,30 @@
* limitations under the License.
*/
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.StringWriter;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+import java.util.concurrent.atomic.AtomicInteger;
+
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@@ -40,12 +64,12 @@
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LineFileDocs;
-import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
+import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
-import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
+import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput;
import org.apache.lucene.util.fst.FST.Arc;
@@ -54,30 +78,6 @@
import org.apache.lucene.util.fst.Util.Result;
import org.apache.lucene.util.packed.PackedInts;
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.OutputStreamWriter;
-import java.io.StringWriter;
-import java.io.Writer;
-import java.nio.charset.StandardCharsets;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Random;
-import java.util.Set;
-import java.util.TreeMap;
-import java.util.TreeSet;
-import java.util.concurrent.atomic.AtomicInteger;
-
import static org.apache.lucene.util.fst.FSTTester.getRandomString;
import static org.apache.lucene.util.fst.FSTTester.simpleRandomString;
import static org.apache.lucene.util.fst.FSTTester.toIntsRef;
@@ -346,7 +346,7 @@ public void testRealTerms() throws Exception {
BytesRef term;
int ord = 0;
- Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();
+ Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();
final TermsEnum termsEnum2 = terms.intersect(new CompiledAutomaton(automaton, false, false), null);
while((term = termsEnum.next()) != null) {
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java
index a06102578b10..bf2f1d2e85d1 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java
@@ -46,11 +46,11 @@
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;
-import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.BasicAutomata;
-import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
+import org.apache.lucene.util.automaton.Automaton;
/**
* Support for highlighting multiterm queries in PostingsHighlighter.
@@ -106,8 +106,8 @@ public String toString() {
final PrefixQuery pq = (PrefixQuery) query;
Term prefix = pq.getPrefix();
if (prefix.field().equals(field)) {
- list.add(new CharacterRunAutomaton(BasicOperations.concatenate(BasicAutomata.makeString(prefix.text()),
- BasicAutomata.makeAnyString())) {
+ list.add(new CharacterRunAutomaton(Operations.concatenate(Automata.makeString(prefix.text()),
+ Automata.makeAnyString())) {
@Override
public String toString() {
return pq.toString();
@@ -126,11 +126,8 @@ public String toString() {
int prefixLength = Math.min(fq.getPrefixLength(), termLength);
String suffix = UnicodeUtil.newString(termText, prefixLength, termText.length - prefixLength);
LevenshteinAutomata builder = new LevenshteinAutomata(suffix, fq.getTranspositions());
- Automaton automaton = builder.toAutomaton(fq.getMaxEdits());
- if (prefixLength > 0) {
- Automaton prefix = BasicAutomata.makeString(UnicodeUtil.newString(termText, 0, prefixLength));
- automaton = BasicOperations.concatenate(prefix, automaton);
- }
+ String prefix = UnicodeUtil.newString(termText, 0, prefixLength);
+ Automaton automaton = builder.toAutomaton(fq.getMaxEdits(), prefix);
list.add(new CharacterRunAutomaton(automaton) {
@Override
public String toString() {
@@ -161,7 +158,7 @@ public String toString() {
final Comparator comparator = CharsRef.getUTF16SortedAsUTF8Comparator();
// this is *not* an automaton, but its very simple
- list.add(new CharacterRunAutomaton(BasicAutomata.makeEmpty()) {
+ list.add(new CharacterRunAutomaton(Automata.makeEmpty()) {
@Override
public boolean run(char[] s, int offset, int length) {
scratch.chars = s;
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
index 30b86f6267de..ee6d8b88b7ec 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
@@ -55,7 +55,7 @@
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
import org.w3c.dom.Element;
@@ -1340,7 +1340,7 @@ public void testMaxSizeHighlightTruncates() throws Exception {
@Override
public void run() throws Exception {
String goodWord = "goodtoken";
- CharacterRunAutomaton stopWords = new CharacterRunAutomaton(BasicAutomata.makeString("stoppedtoken"));
+ CharacterRunAutomaton stopWords = new CharacterRunAutomaton(Automata.makeString("stoppedtoken"));
// we disable MockTokenizer checks because we will forcefully limit the
// tokenstream and call end() before incrementToken() returns false.
final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java
index 48c33fe42341..acf07b6b5644 100644
--- a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java
+++ b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java
@@ -629,7 +629,7 @@ public void testRandom() throws Exception {
}
DocsEnum parents = MultiFields.getTermDocsEnum(joinR, null, "isParent", new BytesRef("x"));
System.out.println("parent docIDs:");
- while (parents.nextDoc() != parents.NO_MORE_DOCS) {
+ while (parents.nextDoc() != DocsEnum.NO_MORE_DOCS) {
System.out.println(" " + parents.docID());
}
}
diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/precedence/TestPrecedenceQueryParser.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/precedence/TestPrecedenceQueryParser.java
index a758a6ff2a8b..389bd0fc1f7f 100644
--- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/precedence/TestPrecedenceQueryParser.java
+++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/precedence/TestPrecedenceQueryParser.java
@@ -46,7 +46,7 @@
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.junit.AfterClass;
import org.junit.BeforeClass;
@@ -557,7 +557,7 @@ public void testSimpleDAO() throws Exception {
}
public void testBoost() throws Exception {
- CharacterRunAutomaton stopSet = new CharacterRunAutomaton(BasicAutomata.makeString("on"));
+ CharacterRunAutomaton stopSet = new CharacterRunAutomaton(Automata.makeString("on"));
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
PrecedenceQueryParser qp = new PrecedenceQueryParser();
diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java
index abcf7a772741..5068210f33ae 100644
--- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java
+++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java
@@ -67,7 +67,7 @@
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
import org.junit.AfterClass;
@@ -957,7 +957,7 @@ public void testSimpleDAO() throws Exception {
}
public void testBoost() throws Exception {
- CharacterRunAutomaton stopSet = new CharacterRunAutomaton(BasicAutomata.makeString("on"));
+ CharacterRunAutomaton stopSet = new CharacterRunAutomaton(Automata.makeString("on"));
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
StandardQueryParser qp = new StandardQueryParser();
qp.setAnalyzer(oneStopAnalyzer);
diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java
index 613a63d804ef..f76c9e1a126c 100644
--- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java
+++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java
@@ -47,7 +47,7 @@
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
import org.junit.AfterClass;
@@ -868,7 +868,7 @@ public void testSimpleDAO()
public void testBoost()
throws Exception {
- CharacterRunAutomaton stopWords = new CharacterRunAutomaton(BasicAutomata.makeString("on"));
+ CharacterRunAutomaton stopWords = new CharacterRunAutomaton(Automata.makeString("on"));
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
CommonQueryParserConfiguration qp = getParserConfig(oneStopAnalyzer);
Query q = getQuery("on^1.0",qp);
diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java
index b4e69e17fde3..3919b4540328 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java
@@ -18,45 +18,25 @@
*/
import java.io.IOException;
-import java.io.PrintStream;
import java.util.Collections;
import java.util.Iterator;
import java.util.TreeMap;
-import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.index.CorruptIndexException;
-import org.apache.lucene.index.DocsAndPositionsEnum;
-import org.apache.lucene.index.DocsEnum;
-import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
-import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.RamUsageEstimator;
-import org.apache.lucene.util.StringHelper;
-import org.apache.lucene.util.automaton.CompiledAutomaton;
-import org.apache.lucene.util.automaton.RunAutomaton;
-import org.apache.lucene.util.automaton.Transition;
-import org.apache.lucene.util.fst.ByteSequenceOutputs;
-import org.apache.lucene.util.fst.FST;
-import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.PairOutputs.Pair;
-import org.apache.lucene.util.fst.PairOutputs;
-import org.apache.lucene.util.fst.Util;
/**
* See {@link VersionBlockTreeTermsWriter}.
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
index 483cbb8d8793..afae341b6e16 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
@@ -17,6 +17,16 @@
* limitations under the License.
*/
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
@@ -33,30 +43,19 @@
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.BasicOperations;
-import org.apache.lucene.util.automaton.SpecialOperations;
-import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
-import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.BytesReader;
-import org.apache.lucene.util.fst.PairOutputs;
+import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs.Pair;
+import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PositiveIntOutputs;
-import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.fst.Util.Result;
import org.apache.lucene.util.fst.Util.TopResults;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
+import org.apache.lucene.util.fst.Util;
/**
* Suggester that first analyzes the surface form, adds the
@@ -255,37 +254,64 @@ public long ramBytesUsed() {
return fst == null ? 0 : fst.ramBytesUsed();
}
- private void copyDestTransitions(State from, State to, List transitions) {
- if (to.isAccept()) {
- from.setAccept(true);
- }
- for(Transition t : to.getTransitions()) {
- transitions.add(t);
+ private int[] topoSortStates(Automaton a) {
+ int[] states = new int[a.getNumStates()];
+ final Set visited = new HashSet<>();
+ final LinkedList worklist = new LinkedList<>();
+ worklist.add(0);
+ visited.add(0);
+ int upto = 0;
+ states[upto] = 0;
+ upto++;
+ Transition t = new Transition();
+ while (worklist.size() > 0) {
+ int s = worklist.removeFirst();
+ int count = a.initTransition(s, t);
+ for (int i=0;i=0;stateNumber--) {
- final State state = states[stateNumber];
- List newTransitions = new ArrayList<>();
- for(Transition t : state.getTransitions()) {
- assert t.getMin() == t.getMax();
- if (t.getMin() == TokenStreamToAutomaton.POS_SEP) {
+ Transition t = new Transition();
+ int[] topoSortStates = topoSortStates(a);
+ for(int i=0;i lookup(final CharSequence key, Set contexts,
}
final BytesRef utf8Key = new BytesRef(key);
try {
-
Automaton lookupAutomaton = toLookupAutomaton(key);
final CharsRef spare = new CharsRef();
@@ -835,7 +862,7 @@ final Set toFiniteStrings(final BytesRef surfaceForm, final TokenStream
automaton = ts2a.toAutomaton(ts);
}
- replaceSep(automaton);
+ automaton = replaceSep(automaton);
automaton = convertAutomaton(automaton);
// TODO: LUCENE-5660 re-enable this once we disallow massive suggestion strings
@@ -848,7 +875,8 @@ final Set toFiniteStrings(final BytesRef surfaceForm, final TokenStream
// TODO: we could walk & add simultaneously, so we
// don't have to alloc [possibly biggish]
// intermediate HashSet in RAM:
- return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
+
+ return Operations.getFiniteStrings(automaton, maxGraphExpansions);
}
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
@@ -856,24 +884,16 @@ final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
// Turn tokenstream into automaton:
Automaton automaton = null;
try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
- automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
+ automaton = getTokenStreamToAutomaton().toAutomaton(ts);
}
- // TODO: we could use the end offset to "guess"
- // whether the final token was a partial token; this
- // would only be a heuristic ... but maybe an OK one.
- // This way we could eg differentiate "net" from "net ",
- // which we can't today...
-
- replaceSep(automaton);
+ automaton = replaceSep(automaton);
// TODO: we can optimize this somewhat by determinizing
// while we convert
- BasicOperations.determinize(automaton);
+ automaton = Operations.determinize(automaton);
return automaton;
}
-
-
/**
* Returns the weight associated with an input string,
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java
index b9e886f42245..ef6ea6034ca7 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java
@@ -17,13 +17,12 @@
* limitations under the License.
*/
+import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
-import java.io.IOException;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Util;
@@ -43,7 +42,7 @@ private FSTUtil() {
public static final class Path {
/** Node in the automaton where path ends: */
- public final State state;
+ public final int state;
/** Node in the FST where path ends: */
public final FST.Arc fstNode;
@@ -55,7 +54,7 @@ public static final class Path {
public final IntsRef input;
/** Sole constructor. */
- public Path(State state, FST.Arc