Skip to content

Commit

Permalink
reverting some changes
Browse files Browse the repository at this point in the history
  • Loading branch information
ieviev committed May 30, 2024
1 parent 8e13e00 commit 7858a2f
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,7 @@ internal MatchingState(SymbolicRegexNode<TSet> node, uint prevCharKind)
}

/// <summary>
/// TODO: The CLR assigns an entire field for this byte which is a waste,
/// and the much more preferred way to use this is in _nullabilityArray in the matcher
/// but the current design relies on interfaces/flags and
/// using the MatchingState directly so this byte is a quick solution to cheapen
/// it there by ~30% as well without having to breaking it all to pieces
/// TODO: This is only used to speed up the existing architecture, ideally should be removed along with IsNullableFor
/// </summary>
internal readonly int NullabilityInfo;

Expand Down Expand Up @@ -106,8 +102,7 @@ internal SymbolicRegexNode<TSet> Next(SymbolicRegexBuilder<TSet> builder, TSet m
}

/// <summary>
/// TODO: This method should really never be used and
/// is only used to speed up the existing architecture.
/// TODO: This method is only used to speed up the existing architecture, ideally should be redesigned
/// Use <see cref="SymbolicRegexMatcher{TSet}.IsNullableWithContext"/>
/// whereever possible
/// </summary>
Expand Down Expand Up @@ -166,7 +161,6 @@ internal StateFlags BuildStateFlags(bool isInitial)
/// 00100 -> nullable for NewLine
/// 01000 -> nullable for NewLineS
/// 10000 -> nullable for WordLetter
/// todo: change to flags later
/// </summary>
/// <returns></returns>
internal byte BuildNullabilityInfo()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,8 @@ public MintermClassifier(BDD[] minterms)
return;
}

// low memory compromise is to create an ascii-only array
// ascii-only array to save memory
// int mintermId = c >= 128 ? 0 : mtlookup[c];
// and only exists because the wasm tests fail with OOM
_isAsciiOnly = true;
for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
{
Expand All @@ -60,8 +59,8 @@ public MintermClassifier(BDD[] minterms)
}
}

// assign minterm category for every char
// unused characters in minterm 0 get mapped to zero
// i have never seen a regex use over 80 minterms not to speak of 255,
// but it's there as a fallback mechanism
if (minterms.Length > 255)
{
// over 255 unique sets also means it's never ascii only
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -532,7 +532,7 @@ BDD MapCategoryCodeToCondition(UnicodeCategory code)
// /// <summary>
// /// attempt to remove anchors when possible since it reduces overhead
// /// more rewrites could be tried but it's important to preserve PCRE semantics
// /// TODO: possibly removing this \b\w+\b != \w+ with due to zero width non-joiner
// /// TODO: possibly removing this \b\w+\b != \w+ due to zero width non-joiner
// /// </summary>
// /// <param name="builder"></param>
// /// <param name="rootNode"></param>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ internal sealed partial class SymbolicRegexMatcher<TSet> : SymbolicRegexMatcher
/// <summary>Data and routines for skipping ahead to the next place a match could potentially start.</summary>
private readonly RegexFindOptimizations? _findOpts;

/// <summary>Dead end state to quickly return NoMatch</summary>
/// <summary>Dead end state to quickly return NoMatch, this could potentially be a constant</summary>
private readonly int _deadStateId;

/// <summary>Whether the pattern contains any anchor</summary>
Expand All @@ -102,6 +102,9 @@ internal sealed partial class SymbolicRegexMatcher<TSet> : SymbolicRegexMatcher
/// <remarks>If the pattern doesn't contain any anchors, there will only be a single initial state.</remarks>
private readonly MatchingState<TSet>[] _reverseInitialStates;

/// <summary>
/// Reversal state which skips fixed length parts. Item1 - number of chars to skip; Item2 - adjusted reversal state.
/// </summary>
private readonly (int, MatchingState<TSet>) _optimizedReversalState;

/// <summary>Partition of the input space of sets.</summary>
Expand Down Expand Up @@ -328,6 +331,8 @@ uint CalculateMintermIdKind(int mintermId)
/// </summary>
internal PerThreadData CreatePerThreadData() => new PerThreadData(_capsize);

/// TODO: when you're calling a function millions of times per second even this add 1 does cost something
/// this should be ideally remapped
/// <summary>Look up what is the character kind given a position ID</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private uint GetPositionKind(int positionId) => _positionKinds[positionId + 1];
Expand All @@ -351,6 +356,7 @@ internal TSet GetMintermFromId(int mintermId)
return minterms[mintermId];
}

/// <summary>TODO: this if-else branch could be called once. it's currently causing overhead on every single step</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private uint GetCharKind<TInputReader>(ReadOnlySpan<char> input, int i)
where TInputReader : struct, IInputReader => !_pattern._info.ContainsSomeAnchor ?
Expand Down Expand Up @@ -657,6 +663,7 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan<char> input, int l
/// TODO: this is essentially a stripped down version when there's no good prefix optimizations
/// i don't trust the compiler to optimize this and it makes a
/// ~50% difference in performance with removing unnecessary checks alone
///
/// </summary>
private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int lengthMinus1, RegexRunnerMode mode,
ref int posRef, int startStateId, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
Expand All @@ -668,9 +675,16 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length
byte[] mtlookup = _mintermClassifier.ByteLookup()!;
int endStateId = endStateIdRef;
int currStateId = startStateId;
// ldfld only once
// int deadStateId = _deadStateId;
try
{
// Loop through each character in the input, transitioning from state to state for each.
// The goal is to make this loop as fast as it can possible be,
// every single piece of overhead should be removed here
// there should be not a single callvirt instruction in the loop
// ldfld only if necessary (e.g. a reference changes)
// no memory writes unless necessary
while (true)
{
if (currStateId == _deadStateId)
Expand Down Expand Up @@ -783,7 +797,7 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza

// If the state is nullable for the next character, meaning it accepts the empty string,
// we found a potential end state.
if (_nullabilityArray[state.DfaStateId] > 0 && TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, TStateHandler.GetStateFlags(this, in state)))
if (TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, TStateHandler.GetStateFlags(this, in state)))
{
endPos = pos;
endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,14 @@ internal static class SymbolicRegexThresholds
/// this should be a very last resort action, going from DFA mode to NFA mode turns 500MB/s to 5MB/s
/// with an entirely different search-time algorithmic complexity
/// 100_000 isn't a really a high memory cost either,
/// i'd even put 1_000_000 on the table but that might push it for general purpose use
/// ideally NFA mode should never be used, 1_000_000 is ok as well but it depends how much memory the user has
/// </remarks>
internal const int NfaThreshold = 100_000;

/// <summary>
/// Default maximum estimated safe expansion size of a <see cref="SymbolicRegexNode{TSet}"/> AST
/// after the AST has been anlayzed for safe handling.
/// TODO: this is perhaps too conservative, consider raising this
/// TODO: this is perhaps too conservative, consider raising this, 5000 is ok even in safety critical scenarios, ~50 000 for general purpose is ok too
/// <remarks>
/// If the AST exceeds this threshold then <see cref="NotSupportedException"/> is thrown.
/// This default value may be overridden with the AppContext data
Expand Down

0 comments on commit 7858a2f

Please sign in to comment.