reverting some changes

dotnet · May 30, 2024 · 7858a2f · 7858a2f
1 parent 8e13e00
commit 7858a2f
Show file tree

Hide file tree

Showing 5 changed files with 24 additions and 17 deletions.
diff --git a/...stem.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/...stem.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
@@ -18,11 +18,7 @@ internal MatchingState(SymbolicRegexNode<TSet> node, uint prevCharKind)
         }
 
         /// <summary>
-        /// TODO: The CLR assigns an entire field for this byte which is a waste,
-        /// and the much more preferred way to use this is in _nullabilityArray in the matcher
-        /// but the current design relies on interfaces/flags and
-        /// using the MatchingState directly so this byte is a quick solution to cheapen
-        /// it there by ~30% as well without having to breaking it all to pieces
+        /// TODO: This is only used to speed up the existing architecture, ideally should be removed along with IsNullableFor
         /// </summary>
         internal readonly int NullabilityInfo;
 
@@ -106,8 +102,7 @@ internal SymbolicRegexNode<TSet> Next(SymbolicRegexBuilder<TSet> builder, TSet m
         }
 
         /// <summary>
-        /// TODO: This method should really never be used and
-        /// is only used to speed up the existing architecture.
+        /// TODO: This method is only used to speed up the existing architecture, ideally should be redesigned
         /// Use <see cref="SymbolicRegexMatcher{TSet}.IsNullableWithContext"/>
         /// whereever possible
         /// </summary>
@@ -166,7 +161,6 @@ internal StateFlags BuildStateFlags(bool isInitial)
         /// 00100 -> nullable for NewLine
         /// 01000 -> nullable for NewLineS
         /// 10000 -> nullable for WordLetter
-        /// todo: change to flags later
         /// </summary>
         /// <returns></returns>
         internal byte BuildNullabilityInfo()

diff --git a/....Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/....Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -47,9 +47,8 @@ public MintermClassifier(BDD[] minterms)
                 return;
             }
 
-            // low memory compromise is to create an ascii-only array
+            // ascii-only array to save memory
             // int mintermId = c >= 128 ? 0 : mtlookup[c];
-            // and only exists because the wasm tests fail with OOM
             _isAsciiOnly = true;
             for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
             {
@@ -60,8 +59,8 @@ public MintermClassifier(BDD[] minterms)
                 }
             }
 
-            // assign minterm category for every char
-            // unused characters in minterm 0 get mapped to zero
+            // i have never seen a regex use over 80 minterms not to speak of 255,
+            // but it's there as a fallback mechanism
             if (minterms.Length > 255)
             {
                 // over 255 unique sets also means it's never ascii only

diff --git a/...Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs b/...Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs
@@ -532,7 +532,7 @@ BDD MapCategoryCodeToCondition(UnicodeCategory code)
         // /// <summary>
         // /// attempt to remove anchors when possible since it reduces overhead
         // /// more rewrites could be tried but it's important to preserve PCRE semantics
-        // /// TODO: possibly removing this \b\w+\b != \w+ with due to zero width non-joiner
+        // /// TODO: possibly removing this \b\w+\b != \w+ due to zero width non-joiner
         // /// </summary>
         // /// <param name="builder"></param>
         // /// <param name="rootNode"></param>

diff --git a/...xt.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/...xt.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -81,7 +81,7 @@ internal sealed partial class SymbolicRegexMatcher<TSet> : SymbolicRegexMatcher
         /// <summary>Data and routines for skipping ahead to the next place a match could potentially start.</summary>
         private readonly RegexFindOptimizations? _findOpts;
 
-        /// <summary>Dead end state to quickly return NoMatch</summary>
+        /// <summary>Dead end state to quickly return NoMatch, this could potentially be a constant</summary>
         private readonly int _deadStateId;
 
         /// <summary>Whether the pattern contains any anchor</summary>
@@ -102,6 +102,9 @@ internal sealed partial class SymbolicRegexMatcher<TSet> : SymbolicRegexMatcher
         /// <remarks>If the pattern doesn't contain any anchors, there will only be a single initial state.</remarks>
         private readonly MatchingState<TSet>[] _reverseInitialStates;
 
+        /// <summary>
+        /// Reversal state which skips fixed length parts. Item1 - number of chars to skip; Item2 - adjusted reversal state.
+        /// </summary>
         private readonly (int, MatchingState<TSet>) _optimizedReversalState;
 
         /// <summary>Partition of the input space of sets.</summary>
@@ -328,6 +331,8 @@ uint CalculateMintermIdKind(int mintermId)
         /// </summary>
         internal PerThreadData CreatePerThreadData() => new PerThreadData(_capsize);
 
+        /// TODO: when you're calling a function millions of times per second even this add 1 does cost something
+        /// this should be ideally remapped
         /// <summary>Look up what is the character kind given a position ID</summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private uint GetPositionKind(int positionId) => _positionKinds[positionId + 1];
@@ -351,6 +356,7 @@ internal TSet GetMintermFromId(int mintermId)
             return minterms[mintermId];
         }
 
+        /// <summary>TODO: this if-else branch could be called once. it's currently causing overhead on every single step</summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private uint GetCharKind<TInputReader>(ReadOnlySpan<char> input, int i)
             where TInputReader : struct, IInputReader => !_pattern._info.ContainsSomeAnchor ?
@@ -657,6 +663,7 @@ private bool FindEndPositionDeltasDFANoSkipAscii(ReadOnlySpan<char> input, int l
         /// TODO: this is essentially a stripped down version when there's no good prefix optimizations
         /// i don't trust the compiler to optimize this and it makes a
         /// ~50% difference in performance with removing unnecessary checks alone
+        ///
         /// </summary>
         private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int lengthMinus1, RegexRunnerMode mode,
                 ref int posRef, int startStateId, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
@@ -668,9 +675,16 @@ private bool FindEndPositionDeltasDFANoSkip(ReadOnlySpan<char> input, int length
             byte[] mtlookup = _mintermClassifier.ByteLookup()!;
             int endStateId = endStateIdRef;
             int currStateId = startStateId;
+            // ldfld only once
+            // int deadStateId = _deadStateId;
             try
             {
                 // Loop through each character in the input, transitioning from state to state for each.
+                // The goal is to make this loop as fast as it can possible be,
+                // every single piece of overhead should be removed here
+                // there should be not a single callvirt instruction in the loop
+                // ldfld only if necessary (e.g. a reference changes)
+                // no memory writes unless necessary
                 while (true)
                 {
                     if (currStateId == _deadStateId)
@@ -783,7 +797,7 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
 
                     // If the state is nullable for the next character, meaning it accepts the empty string,
                     // we found a potential end state.
-                    if (_nullabilityArray[state.DfaStateId] > 0 && TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, TStateHandler.GetStateFlags(this, in state)))
+                    if (TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, TStateHandler.GetStateFlags(this, in state)))
                     {
                         endPos = pos;
                         endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos);

diff --git a/...RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs b/...RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
@@ -26,14 +26,14 @@ internal static class SymbolicRegexThresholds
         /// this should be a very last resort action, going from DFA mode to NFA mode turns 500MB/s to 5MB/s
         /// with an entirely different search-time algorithmic complexity
         /// 100_000 isn't a really a high memory cost either,
-        /// i'd even put 1_000_000 on the table but that might push it for general purpose use
+        /// ideally NFA mode should never be used, 1_000_000 is ok as well but it depends how much memory the user has
         /// </remarks>
         internal const int NfaThreshold = 100_000;
 
         /// <summary>
         /// Default maximum estimated safe expansion size of a <see cref="SymbolicRegexNode{TSet}"/> AST
         /// after the AST has been anlayzed for safe handling.
-        /// TODO: this is perhaps too conservative, consider raising this
+        /// TODO: this is perhaps too conservative, consider raising this, 5000 is ok even in safety critical scenarios, ~50 000 for general purpose is ok too
         /// <remarks>
         /// If the AST exceeds this threshold then <see cref="NotSupportedException"/> is thrown.
         /// This default value may be overridden with the AppContext data