Skip to content

Commit

Permalink
Review and clean up some code
Browse files Browse the repository at this point in the history
Simplification, style consistency, dead code deletion, some bounds-check removal, etc.
  • Loading branch information
stephentoub committed Jul 10, 2024
1 parent 0abaabe commit 6bb517a
Show file tree
Hide file tree
Showing 10 changed files with 370 additions and 437 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
using System.Collections.Generic;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;

#pragma warning disable CS8500 // takes address of managed type

Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,39 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

namespace System.Text.RegularExpressions.Symbolic;
using System.Diagnostics;

internal readonly struct MatchReversal<TSet>(
MatchReversalKind kind,
int fixedLength,
MatchingState<TSet>? adjustedStartState = null)
where TSet : IComparable<TSet>, IEquatable<TSet>
namespace System.Text.RegularExpressions.Symbolic
{
internal MatchReversalKind Kind { get; } = kind;
internal int FixedLength { get; } = fixedLength;
internal MatchingState<TSet>? AdjustedStartState { get; } = adjustedStartState;
/// <summary>Provides details on how a match may be processed in reverse to find the beginning of a match once a match's existence has been confirmed.</summary>
internal readonly struct MatchReversalInfo<TSet> where TSet : IComparable<TSet>, IEquatable<TSet>
{
/// <summary>Initializes the match reversal details.</summary>
internal MatchReversalInfo(MatchReversalKind kind, int fixedLength, MatchingState<TSet>? adjustedStartState = null)
{
Debug.Assert(kind is MatchReversalKind.MatchStart or MatchReversalKind.FixedLength or MatchReversalKind.PartialFixedLength);
Debug.Assert(fixedLength >= 0);
Debug.Assert((adjustedStartState is not null) == (kind is MatchReversalKind.PartialFixedLength));

Kind = kind;
FixedLength = fixedLength;
AdjustedStartState = adjustedStartState;
}

/// <summary>Gets the kind of the match reversal processing required.</summary>
internal MatchReversalKind Kind { get; }

/// <summary>Gets the fixed length of the match, if one is known.</summary>
/// <remarks>
/// For <see cref="MatchReversalKind.MatchStart"/>, this is ignored.
/// For <see cref="MatchReversalKind.FixedLength"/>, this is the full length of the match. The beginning may be found simply
/// by subtracting this length from the end.
/// For <see cref="MatchReversalKind.PartialFixedLength"/>, this is the length of fixed portion of the match.
/// </remarks>
internal int FixedLength { get; }

/// <summary>Gets the adjusted start state to use for partial fixed-length matches.</summary>
/// <remarks>This will be non-null iff <see cref="Kind"/> is <see cref="MatchReversalKind.PartialFixedLength"/>.</remarks>
internal MatchingState<TSet>? AdjustedStartState { get; }
}
}
Original file line number Diff line number Diff line change
@@ -1,14 +1,26 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

namespace System.Text.RegularExpressions.Symbolic;

internal enum MatchReversalKind
namespace System.Text.RegularExpressions.Symbolic
{
/// <summary>The most generic option, run the regex backwards to find beginning of match</summary>
MatchStart,
/// <summary>Part of the reversal is fixed length and can be skipped</summary>
PartialFixedLength,
/// <summary>The entire pattern is fixed length, reversal not necessary</summary>
FixedLength
/// <summary>Specifies the kind of a <see cref="MatchReversalInfo{TSet}"/>.</summary>
internal enum MatchReversalKind
{
/// <summary>The regex should be run in reverse to find beginning of the match.</summary>
MatchStart,

/// <summary>The end of the pattern is of a fixed length and can be skipped as part of running a regex in reverse to find the beginning of the match.</summary>
/// <remarks>
/// Reverse execution is not necessary for a subset of the match.
/// <see cref="MatchReversalInfo{TSet}.FixedLength"/> will contain the length of the fixed portion.
/// </remarks>
PartialFixedLength,

/// <summary>The entire pattern is of a fixed length.</summary>
/// <remarks>
/// Reverse execution is not necessary to find the beginning of the match.
/// <see cref="MatchReversalInfo{TSet}.FixedLength"/> will contain the length of the match.
/// </remarks>
FixedLength
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@ internal MatchingState(SymbolicRegexNode<TSet> node, uint prevCharKind)
NullabilityInfo = BuildNullabilityInfo();
}

internal int NullabilityInfo { get; }

/// <summary>The regular expression that labels this state and gives it its semantics.</summary>
internal SymbolicRegexNode<TSet> Node { get; }

Expand Down Expand Up @@ -98,15 +96,31 @@ internal SymbolicRegexNode<TSet> Next(SymbolicRegexBuilder<TSet> builder, TSet m
return Node.CreateNfaDerivativeWithEffects(builder, minterm, context);
}

/// <summary>
/// Cached nullability check with encoded bits
/// </summary>
/// <summary>Determines whether the node is nullable for the given context.</summary>
/// <remarks>
/// This is functionally equivalent to <see cref="SymbolicRegexNode{TSet}.IsNullableFor(uint)"/>, but using cached
/// answers stored in <see cref="NullabilityInfo"/>.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal bool IsNullableFor(uint nextCharKind)
{
return ((1 << (int)nextCharKind) & NullabilityInfo) != 0;
Debug.Assert(nextCharKind is >= 0 and < CharKind.CharKindCount);
return (NullabilityInfo & (1 << (int)nextCharKind)) != 0;
}

/// <summary>Gets the nullability info for the matching state.</summary>
/// <remarks>
/// <list>
/// <item>00000 -> node cannot be nullable</item>
/// <item>00001 -> nullable for General</item>
/// <item>00010 -> nullable for BeginningEnd</item>
/// <item>00100 -> nullable for NewLine</item>
/// <item>01000 -> nullable for NewLineS</item>
/// <item>10000 -> nullable for WordLetter</item>
/// </list>
/// </remarks>
internal int NullabilityInfo { get; }

/// <summary>
/// Builds a <see cref="StateFlags"/> with the relevant flags set.
/// </summary>
Expand Down Expand Up @@ -138,24 +152,16 @@ internal StateFlags BuildStateFlags(bool isInitial)
return info;
}

/// <summary>
/// Builds the nullability information for the matching state.
/// Nullability for each context is encoded in a bit
/// 0 means node cannot be nullable
/// 00001 -> nullable for General
/// 00010 -> nullable for BeginningEnd
/// 00100 -> nullable for NewLine
/// 01000 -> nullable for NewLineS
/// 10000 -> nullable for WordLetter
/// </summary>
internal byte BuildNullabilityInfo()
/// <summary>Builds the nullability information for the matching state.</summary>
/// <remarks>Nullability for each context is encoded in a bit. See <see cref="NullabilityInfo"/>.</remarks>
private byte BuildNullabilityInfo()
{
byte nullabilityInfo = 0;
if (Node.CanBeNullable)
{
for (uint ck = 0; ck < CharKind.CharKindCount; ck++)
for (uint charKind = 0; charKind < CharKind.CharKindCount; charKind++)
{
nullabilityInfo |= (byte)(Node.IsNullableFor(CharKind.Context(PrevCharKind, ck)) ? 1 << (int)ck : 0);
nullabilityInfo |= (byte)(Node.IsNullableFor(CharKind.Context(PrevCharKind, charKind)) ? 1 << (int)charKind : 0);
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Buffers;
using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;

namespace System.Text.RegularExpressions.Symbolic
Expand All @@ -20,12 +22,12 @@ namespace System.Text.RegularExpressions.Symbolic
/// </remarks>
internal sealed class MintermClassifier
{
/// <summary>An array used to map characters to minterms</summary>
/// <summary>Mapping for characters to minterms, used in the vast majority case when there are less than 256 minterms.</summary>
/// <remarks>_lookup[char] provides the minterm ID. If char &gt;= _lookup.Length, its minterm is 0.</remarks>
private readonly byte[]? _lookup;

/// <summary>
/// Fallback lookup if over 255 minterms. This is rarely used.
/// </summary>
/// <summary>Mapping for characters to minterms, used when there are at least 256 minterms. This is rarely used.</summary>
/// <remarks>_intLookup[char] provides the minterm ID. If char &gt;= _intLookup.Length, its minterm is 0.</remarks>
private readonly int[]? _intLookup;

/// <summary>Create a classifier that maps a character to the ID of its associated minterm.</summary>
Expand All @@ -37,61 +39,64 @@ public MintermClassifier(BDD[] minterms)
if (minterms.Length == 1)
{
// With only a single minterm, the mapping is trivial: everything maps to it (ID 0).
_lookup = Array.Empty<byte>();
_lookup = [];
return;
}

int _maxChar = -1;
// attempt to save memory in common cases by allocating only up to the highest char code
// Compute all minterm ranges. We do this here in order to determine the maximum character value
// in order to size the lookup array to minimize steady-state memory consumption of the potentially
// large lookup array. We prefer to use the byte[] _lookup when possible, in order to keep memory
// consumption to a minimum; doing so accomodates up to 255 minterms, which is the vast majority case.
// However, when there are more than 255 minterms, we need to use int[] _intLookup.
(uint, uint)[][] charRangesPerMinterm = ArrayPool<(uint, uint)[]>.Shared.Rent(minterms.Length);

int maxChar = -1;
for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
{
_maxChar = Math.Max(_maxChar, (int)BDDRangeConverter.ToRanges(minterms[mintermId])[^1].Item2);
(uint, uint)[] ranges = BDDRangeConverter.ToRanges(minterms[mintermId]);
charRangesPerMinterm[mintermId] = ranges;
maxChar = Math.Max(maxChar, (int)ranges[^1].Item2);
}

// It's incredibly rare for a regex to use more than a hundred or two minterms,
// but we need a fallback just in case.
// It's incredibly rare for a regex to use more than a couple hundred minterms,
// but we need a fallback just in case. (Over 128 unique sets also means it's never ASCII only.)
if (minterms.Length > 255)
{
// over 255 unique sets also means it's never ascii only
int[] lookup = new int[_maxChar + 1];
for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
{
// precompute all assigned minterm categories
(uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]);
foreach ((uint start, uint end) in mintermRanges)
{
// assign character ranges in bulk
Span<int> slice = lookup.AsSpan((int)start, (int)(end + 1 - start));
slice.Fill(mintermId);
}
}
_intLookup = lookup;
_intLookup = CreateLookup<int>(minterms, charRangesPerMinterm, maxChar);
}
else
{
byte[] lookup = new byte[_maxChar + 1];
_lookup = CreateLookup<byte>(minterms, charRangesPerMinterm, maxChar);
}

// Return the rented array. We clear it before returning it in order to avoid all the ranges arrays being kept alive.
Array.Clear(charRangesPerMinterm, 0, minterms.Length);
ArrayPool<(uint, uint)[]>.Shared.Return(charRangesPerMinterm);

// Creates the lookup array.
static T[] CreateLookup<T>(BDD[] minterms, ReadOnlySpan<(uint, uint)[]> charRangesPerMinterm, int _maxChar) where T : IBinaryInteger<T>
{
T[] lookup = new T[_maxChar + 1];
for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
{
// precompute all assigned minterm categories
(uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]);
foreach ((uint start, uint end) in mintermRanges)
// Each minterm maps to a range of characters. Set each of the characters in those ranges to the corresponding minterm.
foreach ((uint start, uint end) in charRangesPerMinterm[mintermId])
{
// assign character ranges in bulk
Span<byte> slice = lookup.AsSpan((int)start, (int)(end + 1 - start));
slice.Fill((byte)mintermId);
lookup.AsSpan((int)start, (int)(end + 1 - start)).Fill(T.CreateTruncating(mintermId));
}
}
_lookup = lookup;

return lookup;
}
}

/// <summary>Gets the ID of the minterm associated with the specified character. </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int GetMintermID(int c)
{
if (_intLookup is null)
if (_lookup is not null)
{
byte[] lookup = _lookup!;
byte[] lookup = _lookup;
return (uint)c < (uint)lookup.Length ? lookup[c] : 0;
}
else
Expand All @@ -104,20 +109,17 @@ public int GetMintermID(int c)
/// Gets a quick mapping from char to minterm for the common case when there are &lt;= 255 minterms.
/// Null if there are greater than 255 minterms.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public byte[]? ByteLookup() => _lookup;
public byte[]? ByteLookup => _lookup;

/// <summary>
/// Gets a mapping from char to minterm for the rare case when there are &gt;= 255 minterms.
/// Null in the common case where there are fewer than 255 minterms.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int[]? IntLookup() => _intLookup;
public int[]? IntLookup => _intLookup;

/// <summary>
/// Maximum ordinal character for a non-0 minterm, used to conserve memory
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int MaxChar() => (_lookup?.Length ?? _intLookup!.Length) - 1;
public int MaxChar => (_lookup?.Length ?? _intLookup!.Length) - 1;
}
}
Loading

0 comments on commit 6bb517a

Please sign in to comment.