From de5d18538076f843a7eb75947eee5d78229cc523 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Thu, 11 Jul 2024 10:26:52 -0400 Subject: [PATCH 01/15] Rent object[] rather than (uint,uint)[][] from the ArrayPool --- .../Symbolic/MintermClassifier.cs | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index 24d2a26f849229..5bf81a6191c046 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -47,8 +47,11 @@ public MintermClassifier(BDD[] minterms) // in order to size the lookup array to minimize steady-state memory consumption of the potentially // large lookup array. We prefer to use the byte[] _lookup when possible, in order to keep memory // consumption to a minimum; doing so accomodates up to 255 minterms, which is the vast majority case. - // However, when there are more than 255 minterms, we need to use int[] _intLookup. - (uint, uint)[][] charRangesPerMinterm = ArrayPool<(uint, uint)[]>.Shared.Rent(minterms.Length); + // However, when there are more than 255 minterms, we need to use int[] _intLookup. We rent an object[] + // rather than a (uint,uint)[][] to avoid the extra type pressure on the ArrayPool (object[]s are common, + // (uint,uint)[][]s much less so). + object[] arrayPoolArray = ArrayPool.Shared.Rent(minterms.Length); + Span charRangesPerMinterm = arrayPoolArray.AsSpan(0, minterms.Length); int maxChar = -1; for (int mintermId = 1; mintermId < minterms.Length; mintermId++) @@ -70,17 +73,17 @@ public MintermClassifier(BDD[] minterms) } // Return the rented array. We clear it before returning it in order to avoid all the ranges arrays being kept alive. - Array.Clear(charRangesPerMinterm, 0, minterms.Length); - ArrayPool<(uint, uint)[]>.Shared.Return(charRangesPerMinterm); + charRangesPerMinterm.Clear(); + ArrayPool.Shared.Return(arrayPoolArray); - // Creates the lookup array. - static T[] CreateLookup(BDD[] minterms, ReadOnlySpan<(uint, uint)[]> charRangesPerMinterm, int _maxChar) where T : IBinaryInteger + // Creates the lookup array. charRangesPerMinterm needs to have already been populated with (uint, uint)[] instances. + static T[] CreateLookup(BDD[] minterms, ReadOnlySpan charRangesPerMinterm, int _maxChar) where T : IBinaryInteger { T[] lookup = new T[_maxChar + 1]; for (int mintermId = 1; mintermId < minterms.Length; mintermId++) { // Each minterm maps to a range of characters. Set each of the characters in those ranges to the corresponding minterm. - foreach ((uint start, uint end) in charRangesPerMinterm[mintermId]) + foreach ((uint start, uint end) in ((uint, uint)[])charRangesPerMinterm[mintermId]) { lookup.AsSpan((int)start, (int)(end + 1 - start)).Fill(T.CreateTruncating(mintermId)); } From 6306ab84a243f3984806d594f8a43d94978d93d1 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Thu, 11 Jul 2024 10:38:01 -0400 Subject: [PATCH 02/15] Remove unnecessary TInputReader generic from functions --- .../Symbolic/MintermClassifier.cs | 4 +- .../Symbolic/SymbolicRegexMatcher.cs | 70 +++++++++---------- 2 files changed, 36 insertions(+), 38 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index 5bf81a6191c046..c336fe93ffe1fb 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -104,7 +104,9 @@ public int GetMintermID(int c) } else { - int[] lookup = _intLookup!; + Debug.Assert(_intLookup is not null); + + int[] lookup = _intLookup; return (uint)c < (uint)lookup.Length ? lookup[c] : 0; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 08f423b03344ad..f9e33da926380d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -332,10 +332,10 @@ internal TSet GetMintermFromId(int mintermId) } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private uint GetCharKind(ReadOnlySpan input, int i) - where TInputReader : struct, IInputReader => !_pattern._info.ContainsSomeAnchor ? + private uint GetCharKind(ReadOnlySpan input, int i) => + !_pattern._info.ContainsSomeAnchor ? CharKind.General : // The previous character kind is irrelevant when anchors are not used. - GetPositionKind(TInputReader.GetPositionId(this, input, i)); + GetPositionKind(FullInputReader.GetPositionId(this, input, i)); private void CheckTimeout(long timeoutOccursAt) { @@ -389,8 +389,8 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i { // Fallback for Z anchor or over 255 minterms matchEnd = _findOpts is not null ? - FindEndPositionFallback(input, startat, timeoutOccursAt, mode, perThreadData) : - FindEndPositionFallback(input, startat, timeoutOccursAt, mode, perThreadData); + FindEndPositionFallback(input, startat, timeoutOccursAt, mode, perThreadData) : + FindEndPositionFallback(input, startat, timeoutOccursAt, mode, perThreadData); } // If there wasn't a match, we're done. @@ -425,7 +425,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i if (_optimizedReversalInfo.Kind is MatchReversalKind.MatchStart) { // No fixed-length knowledge. Start at the end of the match. - reversalStartState = new CurrentState(_reverseInitialStates[GetCharKind(input, matchEnd)]); + reversalStartState = new CurrentState(_reverseInitialStates[GetCharKind(input, matchEnd)]); } else { @@ -491,9 +491,9 @@ private int FindEndPositionOptimized(input, pos - 1)]); + var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, pos - 1)]); int endPos = NoMatchExists; int lengthMinus1 = input.Length - 1; @@ -514,7 +514,7 @@ private int FindEndPositionOptimized NfaCharsPerTimeoutCheck ? pos + NfaCharsPerTimeoutCheck : input.Length; - done = FindEndPositionDeltasNFA( + done = FindEndPositionDeltasNFA( input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, ref initialStatePosCandidate, ref initialStatePosCandidate); } @@ -556,14 +556,13 @@ private int FindEndPositionOptimized /// A one-past-the-end index into input for the preferred match, or first final state position if isMatch is true, or NoMatchExists if no match exists. /// - private int FindEndPositionFallback(ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData) - where TInputReader : struct, IInputReader + private int FindEndPositionFallback(ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData) where TFindOptimizationsHandler : struct, IInitialStateHandler where TNullabilityHandler : struct, INullabilityHandler { int initialStatePosCandidate = pos; - var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, pos - 1)]); + var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, pos - 1)]); int endPos = NoMatchExists; int endStateId = -1; @@ -583,7 +582,7 @@ private int FindEndPositionFallback DfaCharsPerTimeoutCheck ? pos + DfaCharsPerTimeoutCheck : input.Length; - done = FindEndPositionDeltasDFA( + done = FindEndPositionDeltasDFA( input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePosCandidate); } else @@ -591,7 +590,7 @@ private int FindEndPositionFallback NfaCharsPerTimeoutCheck ? pos + NfaCharsPerTimeoutCheck : input.Length; - done = FindEndPositionDeltasNFA( + done = FindEndPositionDeltasNFA( input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePosCandidate); } @@ -732,7 +731,7 @@ private bool FindEndPositionDeltasDFAOptimized - /// Workhorse inner loop for . Consumes the character by character, + /// Workhorse inner loop for . Consumes the character by character, /// starting at , for each character transitioning from one state in the DFA or NFA graph to the next state, /// lazily building out the graph as needed. /// @@ -748,10 +747,9 @@ private bool FindEndPositionDeltasDFAOptimized - private bool FindEndPositionDeltasDFA(ReadOnlySpan input, int length, RegexRunnerMode mode, + private bool FindEndPositionDeltasDFA(ReadOnlySpan input, int length, RegexRunnerMode mode, long timeoutOccursAt, ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) where TStateHandler : struct, IStateHandler - where TInputReader : struct, IInputReader where TFindOptimizationsHandler : struct, IInitialStateHandler where TNullabilityHandler : struct, INullabilityHandler { @@ -776,14 +774,14 @@ private bool FindEndPositionDeltasDFA(this, input, ref state, ref pos)) + if (!TFindOptimizationsHandler.TryFindNextStartingPosition(this, input, ref state, ref pos)) { return true; } initialStatePosCandidate = pos; } - int positionId = TInputReader.GetPositionId(this, input, pos); + int positionId = FullInputReader.GetPositionId(this, input, pos); // If the state is nullable for the next character, meaning it accepts the empty string, // we found a potential end state. @@ -822,7 +820,7 @@ private bool FindEndPositionDeltasDFA - /// Workhorse inner loop for . Consumes the character by character, + /// Workhorse inner loop for . Consumes the character by character, /// starting at , for each character transitioning from one state in the DFA or NFA graph to the next state, /// lazily building out the graph as needed. /// @@ -838,11 +836,10 @@ private bool FindEndPositionDeltasDFA - private bool FindEndPositionDeltasNFA( + private bool FindEndPositionDeltasNFA( ReadOnlySpan input, int length, RegexRunnerMode mode, long timeoutOccursAt, ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) where TStateHandler : struct, IStateHandler - where TInputReader : struct, IInputReader where TFindOptimizationsHandler : struct, IInitialStateHandler where TNullabilityHandler : struct, INullabilityHandler { @@ -864,7 +861,7 @@ private bool FindEndPositionDeltasNFA(ReadOnlySpan input, int i, where TInputReader : struct, IInputReader { // Pick the correct start state based on previous character kind. - MatchingState initialState = _initialStates[GetCharKind(input, i - 1)]; + MatchingState initialState = _initialStates[GetCharKind(input, i - 1)]; Registers initialRegisters = perThreadData.InitialRegisters; @@ -1117,7 +1114,7 @@ private Registers FindSubcaptures(ReadOnlySpan input, int i, StateFlags flags = _stateFlagsArray[coreStateId]; Debug.Assert(coreStateId != _deadStateId); - if (flags.IsNullable() || (flags.CanBeNullable() && GetState(coreStateId).IsNullableFor(GetCharKind(input, i + 1)))) + if (flags.IsNullable() || (flags.CanBeNullable() && GetState(coreStateId).IsNullableFor(GetCharKind(input, i + 1)))) { // No lower priority transitions from this or other source states are taken because the // backtracking engines would return the match ending here. @@ -1140,11 +1137,11 @@ private Registers FindSubcaptures(ReadOnlySpan input, int i, foreach ((int endStateId, Registers endRegisters) in current.Values) { MatchingState endState = GetState(GetCoreStateId(endStateId)); - if (endState.IsNullableFor(GetCharKind(input, iEnd))) + if (endState.IsNullableFor(GetCharKind(input, iEnd))) { // Apply effects for finishing at the stored end state endState.Node.ApplyEffects((effect, args) => args.Registers.ApplyEffect(effect, args.Pos), - CharKind.Context(endState.PrevCharKind, GetCharKind(input, iEnd)), (Registers: endRegisters, Pos: iEnd)); + CharKind.Context(endState.PrevCharKind, GetCharKind(input, iEnd)), (Registers: endRegisters, Pos: iEnd)); return endRegisters; } } @@ -1449,7 +1446,7 @@ public static bool IsNullableFor(SymbolicRegexMatcher matcher, in CurrentS /// Gets the preferred DFA state for nullability. In DFA mode this is just the state itself. public static int ExtractNullableCoreStateId(SymbolicRegexMatcher matcher, in CurrentState state, ReadOnlySpan input, int pos) { - uint nextCharKind = matcher.GetCharKind(input, pos); + uint nextCharKind = matcher.GetCharKind(input, pos); foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values)) { MatchingState coreState = matcher.GetState(matcher.GetCoreStateId(nfaState.Key)); @@ -1613,7 +1610,9 @@ private interface IInputReader private readonly struct NoZAnchorInputReader : IInputReader { public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan input, int pos) => - (uint)pos < (uint)input.Length ? matcher._mintermClassifier.GetMintermID(input[pos]) : -1; + (uint)pos < (uint)input.Length ? + matcher._mintermClassifier.GetMintermID(input[pos]) : + -1; } /// This reader includes full handling of an \n as the last character of input for the \Z anchor. @@ -1636,10 +1635,9 @@ public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan private interface IInitialStateHandler { - public static abstract bool TryFindNextStartingPosition( + public static abstract bool TryFindNextStartingPosition( SymbolicRegexMatcher matcher, ReadOnlySpan input, - ref CurrentState state, ref int pos) - where TInputReader : struct, IInputReader; + ref CurrentState state, ref int pos); } /// @@ -1713,8 +1711,7 @@ public static bool TryFindNextStartingPosition( private readonly struct NoOptimizationsInitialStateHandler : IInitialStateHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) - where TInputReader : struct, IInputReader => + public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) => true; // the current position is a possible starting position } @@ -1724,15 +1721,14 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatche private readonly struct InitialStateFindOptimizationsHandler : IInitialStateHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) - where TInputReader : struct, IInputReader + public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) { // Find the first position that matches with some likely character. if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { // Update the starting state based on where TryFindNextStartingPosition moved us to. // As with the initial starting state, if it's a dead end, no match exists. - state = new CurrentState(matcher._dotstarredInitialStates[matcher.GetCharKind(input, pos - 1)]); + state = new CurrentState(matcher._dotstarredInitialStates[matcher.GetCharKind(input, pos - 1)]); return true; } From d2f41ff69dacc16f583e955f6e47654e0943334b Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Thu, 11 Jul 2024 11:06:13 -0400 Subject: [PATCH 03/15] Add more comments and do some renames --- .../Symbolic/SymbolicRegexMatcher.Sample.cs | 2 +- .../Symbolic/SymbolicRegexMatcher.cs | 134 ++++++++++-------- 2 files changed, 76 insertions(+), 60 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Sample.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Sample.cs index 6d9e5baca69a8d..ac7cd0c4b6d890 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Sample.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Sample.cs @@ -71,7 +71,7 @@ public override IEnumerable SampleMatches(int k, int randomseed) NfaMatchingState states = new(); // Here one could also consider previous characters for example for \b, \B, and ^ anchors // and initialize inputSoFar accordingly - states.InitializeFrom(this, _initialStates[GetCharKind([], -1)]); + states.InitializeFrom(this, _initialStates[GetCharKind([], -1)]); CurrentState statesWrapper = new(states); // Used for end suffixes diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index f9e33da926380d..32971bdb0d2be1 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -335,7 +335,7 @@ internal TSet GetMintermFromId(int mintermId) private uint GetCharKind(ReadOnlySpan input, int i) => !_pattern._info.ContainsSomeAnchor ? CharKind.General : // The previous character kind is irrelevant when anchors are not used. - GetPositionKind(FullInputReader.GetPositionId(this, input, i)); + GetPositionKind(DefaultInputReader.GetPositionId(this, input, i)); private void CheckTimeout(long timeoutOccursAt) { @@ -379,18 +379,18 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // Optimize processing for the common case of no Z anchor and <= 255 minterms. Specialize each call with different generic method arguments. matchEnd = (_findOpts is not null, _containsAnyAnchor) switch { - (true, true) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (true, true) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), (true, false) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), (false, false) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (false, true) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (false, true) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), }; } else { // Fallback for Z anchor or over 255 minterms matchEnd = _findOpts is not null ? - FindEndPositionFallback(input, startat, timeoutOccursAt, mode, perThreadData) : - FindEndPositionFallback(input, startat, timeoutOccursAt, mode, perThreadData); + FindEndPositionFallback(input, startat, timeoutOccursAt, mode, perThreadData) : + FindEndPositionFallback(input, startat, timeoutOccursAt, mode, perThreadData); } // If there wasn't a match, we're done. @@ -436,8 +436,8 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // reversal may already be nullable here in the case of anchors if (_containsAnyAnchor && _nullabilityArray[reversalStartState.DfaStateId] > 0 && - FullNullabilityHandler.IsNullableAt( - this, in reversalStartState, FullInputReader.GetPositionId(this, input, i), + DefaultNullabilityHandler.IsNullableAt( + this, in reversalStartState, DefaultInputReader.GetPositionId(this, input, i), DfaStateHandler.GetStateFlags(this, in reversalStartState))) { initialLastStart = i; @@ -448,10 +448,10 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i { // Call FindStartPosition with generic method arguments based on the presence of anchors. This is purely an optimization; // the (true, true) case is functionally complete whereas the (false, false) case is the most optimized. - (true, true) => FindStartPosition(reversalStartState, initialLastStart, input, i, startat, perThreadData), - (true, false) => FindStartPosition(reversalStartState, initialLastStart, input, i, startat, perThreadData), - (false, true) => FindStartPosition(reversalStartState, initialLastStart, input, i, startat, perThreadData), - (false, false) => FindStartPosition(reversalStartState, initialLastStart, input, i, startat, perThreadData), + (true, true) => FindStartPosition(reversalStartState, initialLastStart, input, i, startat, perThreadData), + (true, false) => FindStartPosition(reversalStartState, initialLastStart, input, i, startat, perThreadData), + (false, true) => FindStartPosition(reversalStartState, initialLastStart, input, i, startat, perThreadData), + (false, false) => FindStartPosition(reversalStartState, initialLastStart, input, i, startat, perThreadData), }; break; @@ -477,8 +477,8 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i else { Registers endRegisters = _containsAnyAnchor ? - FindSubcaptures(input, matchStart, matchEnd, perThreadData) : - FindSubcaptures(input, matchStart, matchEnd, perThreadData); + FindSubcaptures(input, matchStart, matchEnd, perThreadData) : + FindSubcaptures(input, matchStart, matchEnd, perThreadData); return new SymbolicMatch(matchStart, matchEnd - matchStart, endRegisters.CaptureStarts, endRegisters.CaptureEnds); } } @@ -514,7 +514,7 @@ private int FindEndPositionOptimized NfaCharsPerTimeoutCheck ? pos + NfaCharsPerTimeoutCheck : input.Length; - done = FindEndPositionDeltasNFA( + done = FindEndPositionDeltasNFA( input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, ref initialStatePosCandidate, ref initialStatePosCandidate); } @@ -781,7 +781,7 @@ private bool FindEndPositionDeltasDFA /// Interface for mapping positions in the input to position IDs, which capture all the information necessary to - /// both take transitions and decide nullability. For positions of valid characters that are handled normally, - /// these IDs coincide with minterm IDs (i.e. indices to ). Positions outside the bounds - /// of the input are mapped to -1. Optionally, an end-of-line as the very last character in the input may be - /// mapped to _minterms.Length for supporting the \Z anchor. + /// both take transitions and decide nullability. /// private interface IInputReader { + /// Gets the position ID for the specified character in the input. + /// + /// For positions of valid characters that are handled normally, these IDs coincide with minterm IDs (i.e. indices to ). + /// Positions outside the bounds of the input are mapped to -1. Optionally, an end-of-line as the very last character in the input may be + /// mapped to _minterms.Length for supporting the \Z anchor. The and parameters are specified + /// separately, rather than input[pos] being passed in as a single , because some inputs need to act differently + /// based on the position itself. + /// public static abstract int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan input, int pos); } - /// This reader omits the special handling of \n for the \Z anchor. - private readonly struct NoZAnchorInputReader : IInputReader - { - public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan input, int pos) => - (uint)pos < (uint)input.Length ? - matcher._mintermClassifier.GetMintermID(input[pos]) : - -1; - } - - /// This reader includes full handling of an \n as the last character of input for the \Z anchor. - private readonly struct FullInputReader : IInputReader + /// Provides an input reader that includes full handling of an \n as the last character of input for the \Z anchor. + private readonly struct DefaultInputReader : IInputReader { + /// + /// Gets the minterm ID of the specified character, -1 if the position isn't within the input, or .Length + /// for a \n at the very end of the input. + /// public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan input, int pos) { if ((uint)pos < (uint)input.Length) @@ -1633,6 +1633,16 @@ public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan } } + /// Provides an optimized input reader that doesn't provide special-handling of \n at the end of the input for the \Z anchor. + private readonly struct NoZAnchorOptimizedInputReader : IInputReader + { + /// Gets the minterm ID of the specified character, or -1 if the position isn't within the input. + public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan input, int pos) => + (uint)pos < (uint)input.Length ? + matcher._mintermClassifier.GetMintermID(input[pos]) : + -1; + } + private interface IInitialStateHandler { public static abstract bool TryFindNextStartingPosition( @@ -1737,27 +1747,30 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matche } } - /// Interface for evaluating nullability of states. + /// Represents a handler for evaluating nullability of states. private interface INullabilityHandler { + /// Gets whether the specified position is nullable. public static abstract bool IsNullableAt( SymbolicRegexMatcher matcher, in CurrentState state, int positionId, StateFlags flags) where TStateHandler : struct, IStateHandler; } - /// This nullability handler interface can be used in DFAs for patterns that do not contain \Z. - private interface IOptimizedNullabilityHandler + /// Nullability handler that will work for any pattern. + private readonly struct DefaultNullabilityHandler : INullabilityHandler { - public static abstract bool IsNullable( - SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, - byte[] lookup, ReadOnlySpan input, int pos); + /// Gets whether the specified position is nullable. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool IsNullableAt(SymbolicRegexMatcher matcher, in CurrentState state, int positionId, StateFlags flags) + where TStateHandler : struct, IStateHandler => + flags.IsNullable() || + (flags.CanBeNullable() && TStateHandler.IsNullableFor(matcher, in state, matcher.GetPositionKind(positionId))); } - /// - /// Specialized nullability handler for patterns without any anchors. - /// + /// Nullability handler for patterns without any anchors. private readonly struct NoAnchorsNullabilityHandler : INullabilityHandler { + /// Gets whether the specified position is nullable. [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool IsNullableAt(SymbolicRegexMatcher matcher, in CurrentState state, int positionId, StateFlags flags) where TStateHandler : struct, IStateHandler @@ -1767,44 +1780,47 @@ public static bool IsNullableAt(SymbolicRegexMatcher matche } } - /// - /// Nullability handler that will work for any pattern. - /// - private readonly struct FullNullabilityHandler : INullabilityHandler + /// Represents a handler for evaluating nullability of states and for use in DFAs for patterns that do not contain \Z anchors. + private interface IOptimizedNullabilityHandler { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool IsNullableAt(SymbolicRegexMatcher matcher, in CurrentState state, int positionId, StateFlags flags) - where TStateHandler : struct, IStateHandler => - flags.IsNullable() || - (flags.CanBeNullable() && TStateHandler.IsNullableFor(matcher, in state, matcher.GetPositionKind(positionId))); + /// Gets whether the specified position is nullable. + public static abstract bool IsNullable( + SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, + byte[] lookup, ReadOnlySpan input, int pos); } - private readonly struct NoAnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler + /// Optimized nullability handler that works regardless of what additional anchors may exist in a pattern. + private readonly struct DefaultOptimizedNullabilityHandler : IOptimizedNullabilityHandler { + /// Gets whether the specified position is nullable. [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan input, int pos) { Debug.Assert(pos >= 0 && pos < input.Length, "input end should not be handled here"); Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date"); - return nullabilityArray[currStateId] > 0; + + if (nullabilityArray[currStateId] > 0) + { + char c = input[pos]; + return matcher.IsNullableWithContext(currStateId, c < (uint)lookup.Length ? lookup[c] : 0); + } + + return false; } } - private readonly struct AnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler + /// Optimized nullability handler for when a pattern has no anchors at all. + private readonly struct NoAnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler { + /// Gets whether the specified position is nullable. [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan input, int pos) { Debug.Assert(pos >= 0 && pos < input.Length, "input end should not be handled here"); Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date"); + Debug.Assert(!matcher._pattern._info.ContainsSomeAnchor); - if (nullabilityArray[currStateId] > 0) - { - char c = input[pos]; - return matcher.IsNullableWithContext(currStateId, c < (uint)lookup.Length ? lookup[c] : 0); - } - - return false; + return nullabilityArray[currStateId] > 0; } } } From 9b51dc675567927f2af96af97c5b1fcc27983f27 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Thu, 11 Jul 2024 11:07:00 -0400 Subject: [PATCH 04/15] Remove unused TFindOptimizationsHandler from FindEndPositionDeltasNFA --- .../RegularExpressions/Symbolic/SymbolicRegexMatcher.cs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 32971bdb0d2be1..a117eff279d92f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -514,7 +514,7 @@ private int FindEndPositionOptimized NfaCharsPerTimeoutCheck ? pos + NfaCharsPerTimeoutCheck : input.Length; - done = FindEndPositionDeltasNFA( + done = FindEndPositionDeltasNFA( input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, ref initialStatePosCandidate, ref initialStatePosCandidate); } @@ -590,7 +590,7 @@ private int FindEndPositionFallback NfaCharsPerTimeoutCheck ? pos + NfaCharsPerTimeoutCheck : input.Length; - done = FindEndPositionDeltasNFA( + done = FindEndPositionDeltasNFA( input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePosCandidate); } @@ -836,11 +836,10 @@ private bool FindEndPositionDeltasDFA - private bool FindEndPositionDeltasNFA( + private bool FindEndPositionDeltasNFA( ReadOnlySpan input, int length, RegexRunnerMode mode, long timeoutOccursAt, ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) where TStateHandler : struct, IStateHandler - where TFindOptimizationsHandler : struct, IInitialStateHandler where TNullabilityHandler : struct, INullabilityHandler { // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning. From 52718b4fadf6fe6567d04fdfe868fdf0ce1300fe Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Thu, 11 Jul 2024 12:13:02 -0400 Subject: [PATCH 05/15] Fix a stray input reader --- .../RegularExpressions/Symbolic/SymbolicRegexMatcher.Sample.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Sample.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Sample.cs index ac7cd0c4b6d890..2bcf4217b790eb 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Sample.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Sample.cs @@ -71,7 +71,7 @@ public override IEnumerable SampleMatches(int k, int randomseed) NfaMatchingState states = new(); // Here one could also consider previous characters for example for \b, \B, and ^ anchors // and initialize inputSoFar accordingly - states.InitializeFrom(this, _initialStates[GetCharKind([], -1)]); + states.InitializeFrom(this, _initialStates[GetCharKind([], -1)]); CurrentState statesWrapper = new(states); // Used for end suffixes From a922f062a37100cba58c4ed0db1ae9d026c4e3b9 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Thu, 11 Jul 2024 11:11:25 -0400 Subject: [PATCH 06/15] Some more renames --- .../Symbolic/SymbolicRegexMatcher.cs | 118 +++++++++--------- 1 file changed, 60 insertions(+), 58 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index a117eff279d92f..131d56c65dd3eb 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -379,10 +379,10 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // Optimize processing for the common case of no Z anchor and <= 255 minterms. Specialize each call with different generic method arguments. matchEnd = (_findOpts is not null, _containsAnyAnchor) switch { - (true, true) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (true, false) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (false, false) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (false, true) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (true, true) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (true, false) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (false, false) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (false, true) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), }; } else @@ -489,7 +489,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i private int FindEndPositionOptimized( ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData) where TAcceleratedStateHandler : struct, IAcceleratedStateHandler - where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler + where TOptimizedNullabilityHandler : struct, IDfaNoZAnchorOptimizedNullabilityHandler { // Initial state candidate. (This is not used in the common DFA case and could potentially be removed in the future.) int initialStatePosCandidate = pos; @@ -632,7 +632,7 @@ private bool FindEndPositionDeltasDFAOptimized input, int lengthMinus1, RegexRunnerMode mode, long timeoutOccursAt, ref int posRef, ref int currentStateIdRef, ref int endPosRef) where TAcceleratedStateHandler : struct, IAcceleratedStateHandler - where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler + where TOptimizedNullabilityHandler : struct, IDfaNoZAnchorOptimizedNullabilityHandler { // Initial check for input end lifted out of the subsequent hot-path loop. if (posRef == input.Length) @@ -1642,11 +1642,43 @@ public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan -1; } + /// Represents a handler used to determine the next possible starting position. private interface IInitialStateHandler { - public static abstract bool TryFindNextStartingPosition( - SymbolicRegexMatcher matcher, ReadOnlySpan input, - ref CurrentState state, ref int pos); + /// Gets the next viable starting position. + /// true if a viable starting position was found; false if no further possible match exists. + public static abstract bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos); + } + + /// Provides an initial state handler for when there are no initial state optimizations to apply. + private readonly struct NoOptimizationsInitialStateHandler : IInitialStateHandler + { + /// Returns true. No optimizations are known to be able to skip states, thus every position is a viable starting position. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) => + true; + } + + /// Provides a handler that uses the matcher's to optimize searching for the next viable starting state. + private readonly struct InitialStateFindOptimizationsHandler : IInitialStateHandler + { + /// Gets the next viable starting position. + /// true if a viable starting position was found; false if no further possible match exists. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) + { + // Find the first position that matches with some likely character. + if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) + { + // Update the starting state based on where TryFindNextStartingPosition moved us to. + // As with the initial starting state, if it's a dead end, no match exists. + state = new CurrentState(matcher._dotstarredInitialStates[matcher.GetCharKind(input, pos - 1)]); + return true; + } + + // No match exists + return false; + } } /// @@ -1659,29 +1691,15 @@ public static abstract bool TryFindNextStartingPosition( byte[] lookup, ref int currentStateId, ref int pos, int initialStateId); } - private readonly struct NoAnchorAcceleratedStateHandler : IAcceleratedStateHandler + private readonly struct NoOptimizationsAcceleratedStateHandler : IAcceleratedStateHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool TryFindNextStartingPosition( - SymbolicRegexMatcher matcher, ReadOnlySpan input, byte[] lookup, ref int currentStateId, ref int pos, int initialStateId) - { - if (currentStateId != initialStateId) - { - return false; - } - - if (!matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) - { - // No match exists - currentStateId = matcher._deadStateId; - pos = input.Length; - } - - return true; - } + SymbolicRegexMatcher matcher, ReadOnlySpan input, byte[] lookup, ref int currentStateId, ref int pos, int initialStateId) => + false; } - private readonly struct AcceleratedStateHandler : IAcceleratedStateHandler + private readonly struct RegexFindOpsAcceleratedStateHandler : IAcceleratedStateHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, @@ -1708,41 +1726,25 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matche } } - private readonly struct NoAcceleratedStateHandler : IAcceleratedStateHandler + private readonly struct NoAnchorRegexFindOpsAcceleratedStateHandler : IAcceleratedStateHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool TryFindNextStartingPosition( - SymbolicRegexMatcher matcher, ReadOnlySpan input, byte[] lookup, ref int currentStateId, ref int pos, int initialStateId) => - false; - } - - /// No-op handler for when there are no initial state optimizations to apply. - private readonly struct NoOptimizationsInitialStateHandler : IInitialStateHandler - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) => - true; // the current position is a possible starting position - } - - /// - /// Handler for when a instance is available. - /// - private readonly struct InitialStateFindOptimizationsHandler : IInitialStateHandler - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) + SymbolicRegexMatcher matcher, ReadOnlySpan input, byte[] lookup, ref int currentStateId, ref int pos, int initialStateId) { - // Find the first position that matches with some likely character. - if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) + if (currentStateId != initialStateId) { - // Update the starting state based on where TryFindNextStartingPosition moved us to. - // As with the initial starting state, if it's a dead end, no match exists. - state = new CurrentState(matcher._dotstarredInitialStates[matcher.GetCharKind(input, pos - 1)]); - return true; + return false; } - // No match exists - return false; + if (!matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) + { + // No match exists + currentStateId = matcher._deadStateId; + pos = input.Length; + } + + return true; } } @@ -1780,7 +1782,7 @@ public static bool IsNullableAt(SymbolicRegexMatcher matche } /// Represents a handler for evaluating nullability of states and for use in DFAs for patterns that do not contain \Z anchors. - private interface IOptimizedNullabilityHandler + private interface IDfaNoZAnchorOptimizedNullabilityHandler { /// Gets whether the specified position is nullable. public static abstract bool IsNullable( @@ -1789,7 +1791,7 @@ public static abstract bool IsNullable( } /// Optimized nullability handler that works regardless of what additional anchors may exist in a pattern. - private readonly struct DefaultOptimizedNullabilityHandler : IOptimizedNullabilityHandler + private readonly struct DefaultDfaNoZAnchorOptimizedNullabilityHandler : IDfaNoZAnchorOptimizedNullabilityHandler { /// Gets whether the specified position is nullable. [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -1809,7 +1811,7 @@ public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabi } /// Optimized nullability handler for when a pattern has no anchors at all. - private readonly struct NoAnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler + private readonly struct NoAnchorDfaOptimizedNullabilityHandler : IDfaNoZAnchorOptimizedNullabilityHandler { /// Gets whether the specified position is nullable. [MethodImpl(MethodImplOptions.AggressiveInlining)] From 73c8226e7bcc1d2ffa7ed4888ecc2cb555de25ab Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Thu, 11 Jul 2024 12:21:53 -0400 Subject: [PATCH 07/15] Avoid duplicated reads of input character and nullability info --- .../Symbolic/SymbolicRegexMatcher.Automata.cs | 4 +- .../Symbolic/SymbolicRegexMatcher.cs | 53 +++++++------------ 2 files changed, 21 insertions(+), 36 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs index 327f5666f9e2a5..8c3960a6fb3502 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -120,8 +120,8 @@ private static void ArrayResizeAndVolatilePublish(ref T[] array, int newSize) /// Pre-computed hot-loop version of nullability check /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - private bool IsNullableWithContext(int stateId, int mintermId) => - (_nullabilityArray[stateId] & (1 << (int)GetPositionKind(mintermId))) > 0; + private bool IsNullableWithContext(byte stateNullability, int mintermId) => + (stateNullability & (1 << (int)GetPositionKind(mintermId))) != 0; /// Returns the span from that may contain transitions for the given state private Span GetDeltasFor(MatchingState state) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 131d56c65dd3eb..37417d9e102a23 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -650,7 +650,7 @@ private bool FindEndPositionDeltasDFAOptimized= lengthMinus1) { if (pos + 1 < input.Length) @@ -1149,15 +1152,12 @@ private Registers FindSubcaptures(ReadOnlySpan input, int i, return default; } - /// Look up the min term ID for the character at the specified position in the input. + /// Look up the min term ID for the character. [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static int GetMintermId(byte[] mintermLookup, ReadOnlySpan input, int pos) - { - Debug.Assert(pos >= 0 && pos < input.Length); - - char c = input[pos]; - return c < (uint)mintermLookup.Length ? mintermLookup[c] : 0; - } + private static int GetMintermId(byte[] mintermLookup, char c) => + c < (uint)mintermLookup.Length ? + mintermLookup[c] : + 0; /// Stores additional data for tracking capture start and end positions. /// The NFA simulation based third phase has one of these for each current state in the current set of live states. @@ -1713,7 +1713,7 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matche if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { - currentStateId = matcher._dotstarredInitialStates[matcher._positionKinds[GetMintermId(lookup, input, pos - 1) + 1]].Id; + currentStateId = matcher._dotstarredInitialStates[matcher._positionKinds[GetMintermId(lookup, input[pos - 1]) + 1]].Id; } else { @@ -1785,9 +1785,7 @@ public static bool IsNullableAt(SymbolicRegexMatcher matche private interface IDfaNoZAnchorOptimizedNullabilityHandler { /// Gets whether the specified position is nullable. - public static abstract bool IsNullable( - SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, - byte[] lookup, ReadOnlySpan input, int pos); + public static abstract bool IsNullable(SymbolicRegexMatcher matcher, byte stateNullability, char c, byte[] lookup); } /// Optimized nullability handler that works regardless of what additional anchors may exist in a pattern. @@ -1795,19 +1793,9 @@ public static abstract bool IsNullable( { /// Gets whether the specified position is nullable. [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan input, int pos) - { - Debug.Assert(pos >= 0 && pos < input.Length, "input end should not be handled here"); - Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date"); - - if (nullabilityArray[currStateId] > 0) - { - char c = input[pos]; - return matcher.IsNullableWithContext(currStateId, c < (uint)lookup.Length ? lookup[c] : 0); - } - - return false; - } + public static bool IsNullable(SymbolicRegexMatcher matcher, byte stateNullability, char c, byte[] lookup) => + stateNullability != 0 && + matcher.IsNullableWithContext(stateNullability, c < (uint)lookup.Length ? lookup[c] : 0); } /// Optimized nullability handler for when a pattern has no anchors at all. @@ -1815,13 +1803,10 @@ public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabi { /// Gets whether the specified position is nullable. [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan input, int pos) + public static bool IsNullable(SymbolicRegexMatcher matcher, byte stateNullability, char c, byte[] lookup) { - Debug.Assert(pos >= 0 && pos < input.Length, "input end should not be handled here"); - Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date"); Debug.Assert(!matcher._pattern._info.ContainsSomeAnchor); - - return nullabilityArray[currStateId] > 0; + return stateNullability != 0; } } } From 341c6b7a10cb084d7ad076a6ed8e998ad7a4b771 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Thu, 11 Jul 2024 15:13:30 -0400 Subject: [PATCH 08/15] Remove initialStateId from TryFindNextStartingPosition and make initial accelerators more similar --- .../Symbolic/SymbolicRegexMatcher.cs | 39 +++++++------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 37417d9e102a23..75089c0738fd0d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -664,7 +664,8 @@ private bool FindEndPositionDeltasDFAOptimizedGets the next viable starting position. /// true if a viable starting position was found; false if no further possible match exists. - public static abstract bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos); + public static abstract bool TryFindNextStartingPosition( + SymbolicRegexMatcher matcher, ReadOnlySpan input, ref int currentStateId, ref int pos); } /// Provides an initial state handler for when there are no initial state optimizations to apply. @@ -1655,7 +1656,8 @@ private interface IInitialStateHandler { /// Returns true. No optimizations are known to be able to skip states, thus every position is a viable starting position. [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) => + public static bool TryFindNextStartingPosition( + SymbolicRegexMatcher matcher, ReadOnlySpan input, ref int currentStateId, ref int pos) => true; } @@ -1665,14 +1667,15 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matche /// Gets the next viable starting position. /// true if a viable starting position was found; false if no further possible match exists. [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) + public static bool TryFindNextStartingPosition( + SymbolicRegexMatcher matcher, ReadOnlySpan input, ref int currentStateId, ref int pos) { // Find the first position that matches with some likely character. if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { // Update the starting state based on where TryFindNextStartingPosition moved us to. // As with the initial starting state, if it's a dead end, no match exists. - state = new CurrentState(matcher._dotstarredInitialStates[matcher.GetCharKind(input, pos - 1)]); + currentStateId = matcher._dotstarredInitialStates[matcher.GetCharKind(input, pos - 1)].Id; return true; } @@ -1687,30 +1690,23 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matche private interface IAcceleratedStateHandler { public static abstract bool TryFindNextStartingPosition( - SymbolicRegexMatcher matcher, ReadOnlySpan input, - byte[] lookup, ref int currentStateId, ref int pos, int initialStateId); + SymbolicRegexMatcher matcher, ReadOnlySpan input, ref int currentStateId, ref int pos, byte[] lookup); } private readonly struct NoOptimizationsAcceleratedStateHandler : IAcceleratedStateHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool TryFindNextStartingPosition( - SymbolicRegexMatcher matcher, ReadOnlySpan input, byte[] lookup, ref int currentStateId, ref int pos, int initialStateId) => + SymbolicRegexMatcher matcher, ReadOnlySpan input, ref int currentStateId, ref int pos, byte[] lookup) => false; } private readonly struct RegexFindOpsAcceleratedStateHandler : IAcceleratedStateHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, - ReadOnlySpan input, - byte[] lookup, ref int currentStateId, ref int pos, int initialStateId) + public static bool TryFindNextStartingPosition( + SymbolicRegexMatcher matcher, ReadOnlySpan input, ref int currentStateId, ref int pos, byte[] lookup) { - if (currentStateId != initialStateId) - { - return false; - } - if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { currentStateId = matcher._dotstarredInitialStates[matcher._positionKinds[GetMintermId(lookup, input[pos - 1]) + 1]].Id; @@ -1730,13 +1726,8 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matche { [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool TryFindNextStartingPosition( - SymbolicRegexMatcher matcher, ReadOnlySpan input, byte[] lookup, ref int currentStateId, ref int pos, int initialStateId) + SymbolicRegexMatcher matcher, ReadOnlySpan input, ref int currentStateId, ref int pos, byte[] lookup) { - if (currentStateId != initialStateId) - { - return false; - } - if (!matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { // No match exists From 9193d6fe0bf9baed763836d63dc8a70c6aa36242 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Thu, 11 Jul 2024 15:20:34 -0400 Subject: [PATCH 09/15] Remove unused initialStatePos / initialStatePosCandidate It's only ever written and not actually used for anything. --- .../Symbolic/SymbolicRegexMatcher.cs | 41 +++++-------------- 1 file changed, 11 insertions(+), 30 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 75089c0738fd0d..9ff2d510f0e99a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -491,8 +491,7 @@ private int FindEndPositionOptimized NfaCharsPerTimeoutCheck ? pos + NfaCharsPerTimeoutCheck : input.Length; done = FindEndPositionDeltasNFA( input, innerLoopLength, mode, timeoutOccursAt, ref pos, - ref currentState, ref endPos, ref initialStatePosCandidate, ref initialStatePosCandidate); + ref currentState, ref endPos); } // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or @@ -560,12 +559,9 @@ private int FindEndPositionFallback DfaCharsPerTimeoutCheck ? pos + DfaCharsPerTimeoutCheck : input.Length; done = FindEndPositionDeltasDFA( - input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePosCandidate); + input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos); } else { @@ -591,7 +587,7 @@ private int FindEndPositionFallback NfaCharsPerTimeoutCheck ? pos + NfaCharsPerTimeoutCheck : input.Length; done = FindEndPositionDeltasNFA( - input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePosCandidate); + input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos); } // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or @@ -747,12 +743,11 @@ private bool FindEndPositionDeltasDFAOptimized. /// /// - /// A positive value if iteration completed because it reached a deadend state or nullable state and the call is an isMatch. - /// 0 if iteration completed because we reached an initial state. - /// A negative value if iteration completed because we ran out of input or we failed to transition. + /// true if all input has been explored and there's no further work to be done; false if there's more input to explore and/or + /// we need to transition from DFA mode to NFA mode. /// private bool FindEndPositionDeltasDFA(ReadOnlySpan input, int length, RegexRunnerMode mode, - long timeoutOccursAt, ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) + long timeoutOccursAt, ref int posRef, ref CurrentState state, ref int endPosRef) where TStateHandler : struct, IStateHandler where TFindOptimizationsHandler : struct, IInitialStateHandler where TNullabilityHandler : struct, INullabilityHandler @@ -760,8 +755,6 @@ private bool FindEndPositionDeltasDFA private bool FindEndPositionDeltasNFA( ReadOnlySpan input, int length, RegexRunnerMode mode, long timeoutOccursAt, - ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) + ref int posRef, ref CurrentState state, ref int endPosRef) where TStateHandler : struct, IStateHandler where TNullabilityHandler : struct, INullabilityHandler { // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning. int pos = posRef; int endPos = endPosRef; - int initialStatePos = initialStatePosRef; - int initialStatePosCandidate = initialStatePosCandidateRef; try { // Loop through each character in the input, transitioning from state to state for each. @@ -870,7 +854,6 @@ private bool FindEndPositionDeltasNFA( if (TNullabilityHandler.IsNullableAt(this, in state, positionId, flags)) { endPos = pos; - initialStatePos = initialStatePosCandidate; // A match is known to exist. If that's all we need to know, we're done. if (mode == RegexRunnerMode.ExistenceRequired) @@ -894,8 +877,6 @@ private bool FindEndPositionDeltasNFA( // Write back the local copies of the ref values. posRef = pos; endPosRef = endPos; - initialStatePosRef = initialStatePos; - initialStatePosCandidateRef = initialStatePosCandidate; } } From b242f8e7473543b49110e48fc5e13265867f9d51 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Thu, 11 Jul 2024 15:36:39 -0400 Subject: [PATCH 10/15] Remove unnecessary generic args and remove resulting dead code Multiple XxDfa / XxNfa methods took a TStateHandler, but it was only ever DfaStateHandler for XxDfa or NfaStateHandler for XxNfa. We can just use the types directly in those methods, rather than generically parameterizing. Doing that revealed all but one of the members of IStateHandler weren't needed on the interface. And removing those revealed a bunch of dead code on DfaStateHandler/NfaStateHandler, which were removed, as well as arguments to some methods that weren't used. --- .../Symbolic/SymbolicRegexMatcher.cs | 126 ++++-------------- 1 file changed, 29 insertions(+), 97 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 9ff2d510f0e99a..5299b9b30d3aca 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -513,8 +513,8 @@ private int FindEndPositionOptimized NfaCharsPerTimeoutCheck ? pos + NfaCharsPerTimeoutCheck : input.Length; - done = FindEndPositionDeltasNFA( - input, innerLoopLength, mode, timeoutOccursAt, ref pos, + done = FindEndPositionDeltasNFA( + input, innerLoopLength, mode, ref pos, ref currentState, ref endPos); } @@ -578,16 +578,16 @@ private int FindEndPositionFallback DfaCharsPerTimeoutCheck ? pos + DfaCharsPerTimeoutCheck : input.Length; - done = FindEndPositionDeltasDFA( - input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos); + done = FindEndPositionDeltasDFA( + input, innerLoopLength, mode, ref pos, ref currentState, ref endPos); } else { // NFA fallback check, assume \Z and full nullability for NFA since it's already extremely rare to get here. const int NfaCharsPerTimeoutCheck = 1_000; innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck ? pos + NfaCharsPerTimeoutCheck : input.Length; - done = FindEndPositionDeltasNFA( - input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos); + done = FindEndPositionDeltasNFA( + input, innerLoopLength, mode, ref pos, ref currentState, ref endPos); } // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or @@ -735,20 +735,12 @@ private bool FindEndPositionDeltasDFAOptimized, for each character transitioning from one state in the DFA or NFA graph to the next state, /// lazily building out the graph as needed. /// - /// - /// The supplies the actual transitioning logic, controlling whether processing is - /// performed in DFA mode or in NFA mode. However, it expects to be configured to match, - /// so for example if is a , it expects the 's - /// to be non-negative and its to be null; vice versa for - /// . - /// /// /// true if all input has been explored and there's no further work to be done; false if there's more input to explore and/or /// we need to transition from DFA mode to NFA mode. /// - private bool FindEndPositionDeltasDFA(ReadOnlySpan input, int length, RegexRunnerMode mode, - long timeoutOccursAt, ref int posRef, ref CurrentState state, ref int endPosRef) - where TStateHandler : struct, IStateHandler + private bool FindEndPositionDeltasDFA(ReadOnlySpan input, int length, RegexRunnerMode mode, + ref int posRef, ref CurrentState stateRef, ref int endPosRef) where TFindOptimizationsHandler : struct, IInitialStateHandler where TNullabilityHandler : struct, INullabilityHandler { @@ -756,6 +748,7 @@ private bool FindEndPositionDeltasDFA(this, in state, positionId, TStateHandler.GetStateFlags(this, in state))) + if (TNullabilityHandler.IsNullableAt(this, in state, positionId, DfaStateHandler.GetStateFlags(this, in state))) { endPos = pos; @@ -791,7 +783,7 @@ private bool FindEndPositionDeltasDFA= length || !TStateHandler.TryTakeTransition(this, ref state, positionId, timeoutOccursAt)) + if (pos >= length || !DfaStateHandler.TryTakeTransition(this, ref state, positionId)) { return false; } @@ -805,6 +797,7 @@ private bool FindEndPositionDeltasDFA, for each character transitioning from one state in the DFA or NFA graph to the next state, /// lazily building out the graph as needed. /// - /// - /// The supplies the actual transitioning logic, controlling whether processing is - /// performed in DFA mode or in NFA mode. However, it expects to be configured to match, - /// so for example if is a , it expects the 's - /// to be non-negative and its to be null; vice versa for - /// . - /// /// /// A positive value if iteration completed because it reached a deadend state or nullable state and the call is an isMatch. /// 0 if iteration completed because we reached an initial state. /// A negative value if iteration completed because we ran out of input or we failed to transition. /// - private bool FindEndPositionDeltasNFA( - ReadOnlySpan input, int length, RegexRunnerMode mode, long timeoutOccursAt, + private bool FindEndPositionDeltasNFA( + ReadOnlySpan input, int length, RegexRunnerMode mode, ref int posRef, ref CurrentState state, ref int endPosRef) - where TStateHandler : struct, IStateHandler where TNullabilityHandler : struct, INullabilityHandler { // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning. @@ -839,7 +824,7 @@ private bool FindEndPositionDeltasNFA( // Loop through each character in the input, transitioning from state to state for each. while (true) { - StateFlags flags = TStateHandler.GetStateFlags(this, in state); + StateFlags flags = NfaStateHandler.GetStateFlags(this, in state); // Dead end here means the set is empty if (state.NfaState!.NfaStateSet.Count == 0) @@ -851,7 +836,7 @@ private bool FindEndPositionDeltasNFA( // If the state is nullable for the next character, meaning it accepts the empty string, // we found a potential end state. - if (TNullabilityHandler.IsNullableAt(this, in state, positionId, flags)) + if (TNullabilityHandler.IsNullableAt(this, in state, positionId, flags)) { endPos = pos; @@ -863,7 +848,7 @@ private bool FindEndPositionDeltasNFA( } // If there is more input available try to transition with the next character. - if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state, positionId, timeoutOccursAt)) + if (pos >= length || !NfaStateHandler.TryTakeTransition(this, ref state, positionId)) { return false; } @@ -910,8 +895,8 @@ private int FindStartPosition(CurrentState st { // Run the DFA or NFA traversal backwards from the current point using the current state. bool done = currentState.NfaState is not null ? - FindStartPositionDeltasNFA(input, ref i, matchStartBoundary, ref currentState, ref lastStart) : - FindStartPositionDeltasDFA(input, ref i, matchStartBoundary, ref currentState, ref lastStart); + FindStartPositionDeltasNFA(input, ref i, matchStartBoundary, ref currentState, ref lastStart) : + FindStartPositionDeltasDFA(input, ref i, matchStartBoundary, ref currentState, ref lastStart); // If we found the starting position, we're done. if (done) @@ -937,9 +922,8 @@ private int FindStartPosition(CurrentState st /// starting at , for each character transitioning from one state in the DFA or NFA graph to the next state, /// lazily building out the graph as needed. /// - private bool FindStartPositionDeltasDFA( + private bool FindStartPositionDeltasDFA( ReadOnlySpan input, ref int i, int startThreshold, ref CurrentState state, ref int lastStart) - where TStateHandler : struct, IStateHandler where TInputReader : struct, IInputReader where TNullabilityHandler : struct, INullabilityHandler { @@ -955,7 +939,7 @@ private bool FindStartPositionDeltasDFA 0 && - TNullabilityHandler.IsNullableAt(this, in state, positionId, TStateHandler.GetStateFlags(this, in state))) + TNullabilityHandler.IsNullableAt(this, in state, positionId, DfaStateHandler.GetStateFlags(this, in state))) { lastStart = pos; } @@ -969,7 +953,7 @@ private bool FindStartPositionDeltasDFA(ReadOnlySpan input, ref int i, int startThreshold, ref CurrentState state, ref int lastStart) - where TStateHandler : struct, IStateHandler + private bool FindStartPositionDeltasNFA(ReadOnlySpan input, ref int i, int startThreshold, ref CurrentState state, ref int lastStart) where TInputReader : struct, IInputReader where TNullabilityHandler : struct, INullabilityHandler { @@ -1002,7 +985,7 @@ private bool FindStartPositionDeltasNFA(this, in state, positionId, TStateHandler.GetStateFlags(this, in state))) + if (TNullabilityHandler.IsNullableAt(this, in state, positionId, NfaStateHandler.GetStateFlags(this, in state))) { lastStart = pos; } @@ -1016,7 +999,7 @@ private bool FindStartPositionDeltasNFARepresents a set of routines for operating over a . private interface IStateHandler { - public static abstract bool StartsWithLineAnchor(SymbolicRegexMatcher matcher, in CurrentState state); public static abstract bool IsNullableFor(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind); - public static abstract int ExtractNullableCoreStateId(SymbolicRegexMatcher matcher, in CurrentState state, ReadOnlySpan input, int pos); - public static abstract int FixedLength(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind); - public static abstract bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId, long timeoutOccursAt); - public static abstract StateFlags GetStateFlags(SymbolicRegexMatcher matcher, in CurrentState state); } /// An for operating over instances configured as DFA states. private readonly struct DfaStateHandler : IStateHandler { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool StartsWithLineAnchor(SymbolicRegexMatcher matcher, in CurrentState state) => matcher.GetState(state.DfaStateId).StartsWithLineAnchor; - [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool IsNullableFor(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind) => matcher._nullabilityArray[state.DfaStateId] > 0 && ((byte)(1 << (int)nextCharKind) & matcher._nullabilityArray[state.DfaStateId]) > 0; - /// Gets the preferred DFA state for nullability. In DFA mode this is just the state itself. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int ExtractNullableCoreStateId(SymbolicRegexMatcher matcher, in CurrentState state, ReadOnlySpan input, int pos) => state.DfaStateId; - - /// Gets the length of any fixed-length marker that exists for this state, or -1 if there is none. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int FixedLength(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind) => matcher.GetState(state.DfaStateId).FixedLength(nextCharKind); - /// Take the transition to the next DFA state. [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId, - long timeoutOccursAt) + public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId) { Debug.Assert(state.DfaStateId > 0, $"Expected non-zero {nameof(state.DfaStateId)}."); Debug.Assert(state.NfaState is null, $"Expected null {nameof(state.NfaState)}."); @@ -1423,42 +1389,8 @@ public static bool IsNullableFor(SymbolicRegexMatcher matcher, in CurrentS return false; } - /// Gets the preferred DFA state for nullability. In DFA mode this is just the state itself. - public static int ExtractNullableCoreStateId(SymbolicRegexMatcher matcher, in CurrentState state, ReadOnlySpan input, int pos) - { - uint nextCharKind = matcher.GetCharKind(input, pos); - foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values)) - { - MatchingState coreState = matcher.GetState(matcher.GetCoreStateId(nfaState.Key)); - if (coreState.IsNullableFor(nextCharKind)) - { - return coreState.Id; - } - } - - Debug.Fail("ExtractNullableCoreStateId should only be called in nullable state/context."); - return -1; - } - - /// Gets the length of any fixed-length marker that exists for this state, or -1 if there is none. - public static int FixedLength(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind) - { - foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values)) - { - MatchingState coreState = matcher.GetState(matcher.GetCoreStateId(nfaState.Key)); - if (coreState.IsNullableFor(nextCharKind)) - { - return coreState.FixedLength(nextCharKind); - } - } - - Debug.Fail("FixedLength should only be called in nullable state/context."); - return -1; - } - /// Take the transition to the next NFA state. - public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId, - long timeoutOccursAt = 0) + public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId) { Debug.Assert(state.DfaStateId < 0, $"Expected negative {nameof(state.DfaStateId)}."); Debug.Assert(state.NfaState is not null, $"Expected non-null {nameof(state.NfaState)}."); From 401b65ae075134307f5b0206694e271ad4b1941b Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Thu, 11 Jul 2024 16:52:22 -0400 Subject: [PATCH 11/15] Put GetStateFlags back in IStateHandler and use it to avoid duplication at call sites --- .../Symbolic/SymbolicRegexMatcher.cs | 47 ++++++++++--------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 5299b9b30d3aca..169b8373c7e182 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -437,8 +437,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i if (_containsAnyAnchor && _nullabilityArray[reversalStartState.DfaStateId] > 0 && DefaultNullabilityHandler.IsNullableAt( - this, in reversalStartState, DefaultInputReader.GetPositionId(this, input, i), - DfaStateHandler.GetStateFlags(this, in reversalStartState))) + this, in reversalStartState, DefaultInputReader.GetPositionId(this, input, i))) { initialLastStart = i; } @@ -771,7 +770,7 @@ private bool FindEndPositionDeltasDFA(this, in state, positionId, DfaStateHandler.GetStateFlags(this, in state))) + if (TNullabilityHandler.IsNullableAt(this, in state, positionId)) { endPos = pos; @@ -783,7 +782,7 @@ private bool FindEndPositionDeltasDFA= length || !DfaStateHandler.TryTakeTransition(this, ref state, positionId)) + if (pos >= length || !DfaStateHandler.TryTakeTransition(this, ref state.DfaStateId, positionId)) { return false; } @@ -824,8 +823,6 @@ private bool FindEndPositionDeltasNFA( // Loop through each character in the input, transitioning from state to state for each. while (true) { - StateFlags flags = NfaStateHandler.GetStateFlags(this, in state); - // Dead end here means the set is empty if (state.NfaState!.NfaStateSet.Count == 0) { @@ -836,7 +833,7 @@ private bool FindEndPositionDeltasNFA( // If the state is nullable for the next character, meaning it accepts the empty string, // we found a potential end state. - if (TNullabilityHandler.IsNullableAt(this, in state, positionId, flags)) + if (TNullabilityHandler.IsNullableAt(this, in state, positionId)) { endPos = pos; @@ -938,8 +935,8 @@ private bool FindStartPositionDeltasDFA( // If the state accepts the empty string, we found a valid starting position. Record it and keep going, // since we're looking for the earliest one to occur within bounds. - if (_nullabilityArray[state.DfaStateId] > 0 && - TNullabilityHandler.IsNullableAt(this, in state, positionId, DfaStateHandler.GetStateFlags(this, in state))) + if (_nullabilityArray[state.DfaStateId] != 0 && + TNullabilityHandler.IsNullableAt(this, in state, positionId)) { lastStart = pos; } @@ -953,7 +950,7 @@ private bool FindStartPositionDeltasDFA( } // Try to transition with the next character, the one before the current position. - if (!DfaStateHandler.TryTakeTransition(this, ref state, positionId)) + if (!DfaStateHandler.TryTakeTransition(this, ref state.DfaStateId, positionId)) { // Return false to indicate the search didn't finish. return false; @@ -985,7 +982,7 @@ private bool FindStartPositionDeltasNFA(ReadO // If the state accepts the empty string, we found a valid starting position. Record it and keep going, // since we're looking for the earliest one to occur within bounds. - if (TNullabilityHandler.IsNullableAt(this, in state, positionId, NfaStateHandler.GetStateFlags(this, in state))) + if (TNullabilityHandler.IsNullableAt(this, in state, positionId)) { lastStart = pos; } @@ -1274,6 +1271,7 @@ public CurrentState(NfaMatchingState nfaState) private interface IStateHandler { public static abstract bool IsNullableFor(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind); + public static abstract StateFlags GetStateFlags(SymbolicRegexMatcher matcher, in CurrentState state); } /// An for operating over instances configured as DFA states. @@ -1285,29 +1283,28 @@ public static bool IsNullableFor(SymbolicRegexMatcher matcher, in CurrentS /// Take the transition to the next DFA state. [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId) + public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref int dfaStateId, int mintermId) { - Debug.Assert(state.DfaStateId > 0, $"Expected non-zero {nameof(state.DfaStateId)}."); - Debug.Assert(state.NfaState is null, $"Expected null {nameof(state.NfaState)}."); + Debug.Assert(dfaStateId > 0, $"Expected non-zero {nameof(dfaStateId)}."); // Use the mintermId for the character being read to look up which state to transition to. // If that state has already been materialized, move to it, and we're done. If that state // hasn't been materialized, try to create it; if we can, move to it, and we're done. - int dfaOffset = matcher.DeltaOffset(state.DfaStateId, mintermId); + int dfaOffset = matcher.DeltaOffset(dfaStateId, mintermId); int nextStateId = matcher._dfaDelta[dfaOffset]; if (nextStateId > 0) { // There was an existing DFA transition to some state. Move to it and // return that we're still operating as a DFA and can keep going. - state.DfaStateId = nextStateId; + dfaStateId = nextStateId; return true; } - if (matcher.TryCreateNewTransition(matcher.GetState(state.DfaStateId), mintermId, dfaOffset, checkThreshold: true, out MatchingState? nextState)) + if (matcher.TryCreateNewTransition(matcher.GetState(dfaStateId), mintermId, dfaOffset, checkThreshold: true, out MatchingState? nextState)) { // We were able to create a new DFA transition to some state. Move to it and // return that we're still operating as a DFA and can keep going. - state.DfaStateId = nextState.Id; + dfaStateId = nextState.Id; return true; } @@ -1657,7 +1654,7 @@ private interface INullabilityHandler { /// Gets whether the specified position is nullable. public static abstract bool IsNullableAt( - SymbolicRegexMatcher matcher, in CurrentState state, int positionId, StateFlags flags) + SymbolicRegexMatcher matcher, in CurrentState state, int positionId) where TStateHandler : struct, IStateHandler; } @@ -1666,10 +1663,14 @@ public static abstract bool IsNullableAt( { /// Gets whether the specified position is nullable. [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool IsNullableAt(SymbolicRegexMatcher matcher, in CurrentState state, int positionId, StateFlags flags) - where TStateHandler : struct, IStateHandler => + public static bool IsNullableAt(SymbolicRegexMatcher matcher, in CurrentState state, int positionId) + where TStateHandler : struct, IStateHandler + { + var flags = TStateHandler.GetStateFlags(matcher, in state); + return flags.IsNullable() || (flags.CanBeNullable() && TStateHandler.IsNullableFor(matcher, in state, matcher.GetPositionKind(positionId))); + } } /// Nullability handler for patterns without any anchors. @@ -1677,11 +1678,11 @@ public static bool IsNullableAt(SymbolicRegexMatcher matche { /// Gets whether the specified position is nullable. [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool IsNullableAt(SymbolicRegexMatcher matcher, in CurrentState state, int positionId, StateFlags flags) + public static bool IsNullableAt(SymbolicRegexMatcher matcher, in CurrentState state, int positionId) where TStateHandler : struct, IStateHandler { Debug.Assert(!matcher._pattern._info.ContainsSomeAnchor); - return flags.IsNullable(); + return TStateHandler.GetStateFlags(matcher, in state).IsNullable(); } } From adae5e2ce206af49afaafe62cbb17209ea0faef1 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Thu, 11 Jul 2024 17:25:25 -0400 Subject: [PATCH 12/15] Put out argument last in TryCreateNewTransition --- .../Symbolic/SymbolicRegexMatcher.Automata.cs | 4 +--- .../Symbolic/SymbolicRegexMatcher.Explore.cs | 8 +++++++- .../RegularExpressions/Symbolic/SymbolicRegexMatcher.cs | 9 +++------ 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs index 8c3960a6fb3502..e192bb4b50959b 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -355,9 +355,7 @@ private int GetCoreStateId(int nfaStateId) /// Gets or creates a new DFA transition. /// This function locks the matcher for safe concurrent use of the - private bool TryCreateNewTransition( - MatchingState sourceState, int mintermId, int offset, bool checkThreshold, [NotNullWhen(true)] out MatchingState? nextState, - long timeoutOccursAt = 0) + private bool TryCreateNewTransition(MatchingState sourceState, int mintermId, int offset, bool checkThreshold, long timeoutOccursAt, [NotNullWhen(true)] out MatchingState? nextState) { Debug.Assert(offset < _dfaDelta.Length); lock (this) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Explore.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Explore.cs index d30a2d9d02e3f3..f73bf610446a53 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Explore.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Explore.cs @@ -35,16 +35,22 @@ public override void Explore(bool includeDotStarred, bool includeReverse, bool i { // Don't dequeue yet, because a transition might fail MatchingState state = toExplore.Peek(); + // Include the special minterm for the last end-of-line if the state is sensitive to it int maxMinterm = state.StartsWithLineAnchor ? _minterms!.Length : _minterms!.Length - 1; + // Explore successor states for each minterm for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId) { int offset = DeltaOffset(state.Id, mintermId); - if (!TryCreateNewTransition(state, mintermId, offset, true, out MatchingState? nextState)) + if (!TryCreateNewTransition(state, mintermId, offset, true, 0, out MatchingState? nextState)) + { goto DfaLimitReached; + } + EnqueueIfUnseen(nextState, seen, toExplore); } + // Safe to dequeue now that the state has been completely handled toExplore.Dequeue(); } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 169b8373c7e182..3e0684766c2f3e 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -1300,7 +1300,7 @@ public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref int return true; } - if (matcher.TryCreateNewTransition(matcher.GetState(dfaStateId), mintermId, dfaOffset, checkThreshold: true, out MatchingState? nextState)) + if (matcher.TryCreateNewTransition(matcher.GetState(dfaStateId), mintermId, dfaOffset, checkThreshold: true, timeoutOccursAt: 0, out MatchingState? nextState)) { // We were able to create a new DFA transition to some state. Move to it and // return that we're still operating as a DFA and can keep going. @@ -1313,8 +1313,7 @@ public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref int /// Transition function that only considers DFA state id [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static bool TryTakeDFATransition(SymbolicRegexMatcher matcher, ref int state, - int mintermId, long timeoutOccursAt) + internal static bool TryTakeDFATransition(SymbolicRegexMatcher matcher, ref int state, int mintermId, long timeoutOccursAt) { Debug.Assert(state > 0, $"Expected {nameof(state)} {state} > 0"); @@ -1330,9 +1329,7 @@ internal static bool TryTakeDFATransition(SymbolicRegexMatcher matcher, re return true; } - if (matcher.TryCreateNewTransition(matcher.GetState(state), mintermId, - matcher.DeltaOffset(state, mintermId), - checkThreshold: true, out MatchingState? nextState, timeoutOccursAt)) + if (matcher.TryCreateNewTransition(matcher.GetState(state), mintermId, matcher.DeltaOffset(state, mintermId), checkThreshold: true, timeoutOccursAt, out MatchingState? nextState)) { // We were able to create a new DFA transition to some state. Move to it and // return that we're still operating as a DFA and can keep going. From 2d93daa7206c3592f019052c6ea46661d0debe95 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Thu, 11 Jul 2024 17:27:12 -0400 Subject: [PATCH 13/15] Store state to local in FindStartPositionDeltasDFA --- .../Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 3e0684766c2f3e..02edf56a073de7 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -920,12 +920,13 @@ private int FindStartPosition(CurrentState st /// lazily building out the graph as needed. /// private bool FindStartPositionDeltasDFA( - ReadOnlySpan input, ref int i, int startThreshold, ref CurrentState state, ref int lastStart) + ReadOnlySpan input, ref int i, int startThreshold, ref CurrentState stateRef, ref int lastStart) where TInputReader : struct, IInputReader where TNullabilityHandler : struct, INullabilityHandler { // To avoid frequent reads/writes to ref values, make and operate on local copies, which we then copy back once before returning. int pos = i; + CurrentState state = stateRef; try { // Loop backwards through each character in the input, transitioning from state to state for each. @@ -963,6 +964,7 @@ private bool FindStartPositionDeltasDFA( finally { // Write back the local copies of the ref values. + stateRef = state; i = pos; } } From 9f614258c4913cf7993c3c1636d412dbfe83aae7 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Thu, 11 Jul 2024 18:28:47 -0400 Subject: [PATCH 14/15] Merge IAcceleratedStateHandler into IInitialStateHandler --- .../Symbolic/SymbolicRegexMatcher.cs | 170 ++++++++++-------- 1 file changed, 94 insertions(+), 76 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 02edf56a073de7..7d002191d8267a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -379,17 +379,17 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // Optimize processing for the common case of no Z anchor and <= 255 minterms. Specialize each call with different generic method arguments. matchEnd = (_findOpts is not null, _containsAnyAnchor) switch { - (true, true) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (true, false) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (false, false) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), - (false, true) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (true, true) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (true, false) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (false, true) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (false, false) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), }; } else { // Fallback for Z anchor or over 255 minterms matchEnd = _findOpts is not null ? - FindEndPositionFallback(input, startat, timeoutOccursAt, mode, perThreadData) : + FindEndPositionFallback(input, startat, timeoutOccursAt, mode, perThreadData) : FindEndPositionFallback(input, startat, timeoutOccursAt, mode, perThreadData); } @@ -436,8 +436,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // reversal may already be nullable here in the case of anchors if (_containsAnyAnchor && _nullabilityArray[reversalStartState.DfaStateId] > 0 && - DefaultNullabilityHandler.IsNullableAt( - this, in reversalStartState, DefaultInputReader.GetPositionId(this, input, i))) + DefaultNullabilityHandler.IsNullableAt(this, in reversalStartState, DefaultInputReader.GetPositionId(this, input, i))) { initialLastStart = i; } @@ -485,9 +484,9 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i /// /// Streamlined version of that doesn't handle /z anchors or very large sets of minterms. /// - private int FindEndPositionOptimized( + private int FindEndPositionOptimized( ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData) - where TAcceleratedStateHandler : struct, IAcceleratedStateHandler + where TInitialStateHandler : struct, IInitialStateHandler where TOptimizedNullabilityHandler : struct, IDfaNoZAnchorOptimizedNullabilityHandler { // Initial state candidate. @@ -503,7 +502,7 @@ private int FindEndPositionOptimized DfaCharsPerTimeoutCheck ? pos + DfaCharsPerTimeoutCheck : lengthMinus1; - done = FindEndPositionDeltasDFAOptimized( + done = FindEndPositionDeltasDFAOptimized( input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState.DfaStateId, ref endPos); } @@ -554,8 +553,8 @@ private int FindEndPositionOptimized /// A one-past-the-end index into input for the preferred match, or first final state position if isMatch is true, or NoMatchExists if no match exists. /// - private int FindEndPositionFallback(ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData) - where TFindOptimizationsHandler : struct, IInitialStateHandler + private int FindEndPositionFallback(ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData) + where TInitialStateHandler : struct, IInitialStateHandler where TNullabilityHandler : struct, INullabilityHandler { var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, pos - 1)]); @@ -577,7 +576,7 @@ private int FindEndPositionFallback DfaCharsPerTimeoutCheck ? pos + DfaCharsPerTimeoutCheck : input.Length; - done = FindEndPositionDeltasDFA( + done = FindEndPositionDeltasDFA( input, innerLoopLength, mode, ref pos, ref currentState, ref endPos); } else @@ -623,10 +622,10 @@ private int FindEndPositionFallback - private bool FindEndPositionDeltasDFAOptimized( + private bool FindEndPositionDeltasDFAOptimized( ReadOnlySpan input, int lengthMinus1, RegexRunnerMode mode, long timeoutOccursAt, ref int posRef, ref int currentStateIdRef, ref int endPosRef) - where TAcceleratedStateHandler : struct, IAcceleratedStateHandler + where TInitialStateHandler : struct, IInitialStateHandler where TOptimizedNullabilityHandler : struct, IDfaNoZAnchorOptimizedNullabilityHandler { // Initial check for input end lifted out of the subsequent hot-path loop. @@ -645,7 +644,7 @@ private bool FindEndPositionDeltasDFAOptimized= lengthMinus1) { if (pos + 1 < input.Length) @@ -738,9 +737,9 @@ private bool FindEndPositionDeltasDFAOptimized - private bool FindEndPositionDeltasDFA(ReadOnlySpan input, int length, RegexRunnerMode mode, + private bool FindEndPositionDeltasDFA(ReadOnlySpan input, int length, RegexRunnerMode mode, ref int posRef, ref CurrentState stateRef, ref int endPosRef) - where TFindOptimizationsHandler : struct, IInitialStateHandler + where TInitialStateHandler : struct, IInitialStateHandler where TNullabilityHandler : struct, INullabilityHandler { // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning. @@ -755,13 +754,8 @@ private bool FindEndPositionDeltasDFA matcher, in CurrentState state, - uint nextCharKind) => matcher._nullabilityArray[state.DfaStateId] > 0 && ((byte)(1 << (int)nextCharKind) & matcher._nullabilityArray[state.DfaStateId]) > 0; + public static bool IsNullableFor(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind) => + matcher._nullabilityArray[state.DfaStateId] > 0 && + ((byte)(1 << (int)nextCharKind) & matcher._nullabilityArray[state.DfaStateId]) > 0; /// Take the transition to the next DFA state. [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -1462,7 +1457,7 @@ static int[] GetNextStates(int sourceState, int mintermId, SymbolicRegexMatcher< /// In NFA mode: /// - an empty set of states means that it is a dead end /// - no set of states qualifies as an initial state. This could be made more accurate, but with that the - /// matching logic would need to be updated to handle the fact that + /// matching logic would need to be updated to handle the fact that /// can transition back to a DFA state. /// [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -1551,36 +1546,51 @@ public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan -1; } - /// Represents a handler used to determine the next possible starting position. + /// Represents a handler used to determine the next possible matching position from an initial state. private interface IInitialStateHandler { + /// Gets whether the handler performs any meaningful operation. If false, always returns true. + /// + /// This should be implemented to always return a constant true or false. The consumer will inline it and, if this is false, can dead-code eliminate + /// anything guarded by the condition. + /// + public static abstract bool IsOptimized { get; } + /// Gets the next viable starting position. - /// true if a viable starting position was found; false if no further possible match exists. + /// true if a possible match location is found; false if no match is possible anywhere in the remaining input. + /// This may be used if is false but it will then always return true indicating that the current position may be viable. public static abstract bool TryFindNextStartingPosition( - SymbolicRegexMatcher matcher, ReadOnlySpan input, ref int currentStateId, ref int pos); + SymbolicRegexMatcher matcher, ReadOnlySpan input, ref int currentStateId, ref int pos, byte[]? lookup); } /// Provides an initial state handler for when there are no initial state optimizations to apply. private readonly struct NoOptimizationsInitialStateHandler : IInitialStateHandler { + /// Returns false. + public static bool IsOptimized => false; + /// Returns true. No optimizations are known to be able to skip states, thus every position is a viable starting position. - [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool TryFindNextStartingPosition( - SymbolicRegexMatcher matcher, ReadOnlySpan input, ref int currentStateId, ref int pos) => + SymbolicRegexMatcher matcher, ReadOnlySpan input, ref int currentStateId, ref int pos, byte[]? lookup) => true; } /// Provides a handler that uses the matcher's to optimize searching for the next viable starting state. - private readonly struct InitialStateFindOptimizationsHandler : IInitialStateHandler + private readonly struct FindOptimizationsInitialStateHandler : IInitialStateHandler { + /// Returns true. + public static bool IsOptimized => true; + /// Gets the next viable starting position. /// true if a viable starting position was found; false if no further possible match exists. [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool TryFindNextStartingPosition( - SymbolicRegexMatcher matcher, ReadOnlySpan input, ref int currentStateId, ref int pos) + SymbolicRegexMatcher matcher, ReadOnlySpan input, ref int currentStateId, ref int pos, byte[]? lookup) { + Debug.Assert(matcher._findOpts is not null); + // Find the first position that matches with some likely character. - if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) + if (matcher._findOpts.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { // Update the starting state based on where TryFindNextStartingPosition moved us to. // As with the initial starting state, if it's a dead end, no match exists. @@ -1589,62 +1599,70 @@ public static bool TryFindNextStartingPosition( } // No match exists + Debug.Assert(pos == input.Length); + currentStateId = matcher._deadStateId; return false; } } - /// - /// Interface for accelerated states, returns true if position was changed - /// - private interface IAcceleratedStateHandler - { - public static abstract bool TryFindNextStartingPosition( - SymbolicRegexMatcher matcher, ReadOnlySpan input, ref int currentStateId, ref int pos, byte[] lookup); - } - - private readonly struct NoOptimizationsAcceleratedStateHandler : IAcceleratedStateHandler + /// Provides a handler that uses the matcher's to optimize searching for the next viable starting state. + /// This implementation works only when there are no /Z anchors in the pattern. + private readonly struct NoZAnchorFindOptimizationsInitialStateHandler : IInitialStateHandler { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TryFindNextStartingPosition( - SymbolicRegexMatcher matcher, ReadOnlySpan input, ref int currentStateId, ref int pos, byte[] lookup) => - false; - } + /// Returns true. + public static bool IsOptimized => true; - private readonly struct RegexFindOpsAcceleratedStateHandler : IAcceleratedStateHandler - { + /// Gets the next viable starting position. + /// true if a viable starting position was found; false if no further possible match exists. [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool TryFindNextStartingPosition( - SymbolicRegexMatcher matcher, ReadOnlySpan input, ref int currentStateId, ref int pos, byte[] lookup) + SymbolicRegexMatcher matcher, ReadOnlySpan input, ref int currentStateId, ref int pos, byte[]? lookup) { - if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) + Debug.Assert(matcher._findOpts is not null); + Debug.Assert(lookup is not null, $"{nameof(NoZAnchorFindOptimizationsInitialStateHandler)} must only be used with call sites that pass non-null {nameof(lookup)}."); + + if (matcher._findOpts.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { + // Update the starting state based on where TryFindNextStartingPosition moved us to. + // This is an optimized version of the update in FindOptimizationsInitialStateHandler that doesn't need to consider the possibility of /Z anchors. currentStateId = matcher._dotstarredInitialStates[matcher._positionKinds[GetMintermId(lookup, input[pos - 1]) + 1]].Id; - } - else - { - // No match exists - currentStateId = matcher._deadStateId; - pos = input.Length; + return true; } - return true; + // No match exists + Debug.Assert(pos == input.Length); + currentStateId = matcher._deadStateId; + return false; } } - private readonly struct NoAnchorRegexFindOpsAcceleratedStateHandler : IAcceleratedStateHandler + /// Provides a handler that uses the matcher's to optimize searching for the next viable starting state. + /// This implementation works only when there are no anchors in the pattern. + private readonly struct NoAnchorsFindOptimizationsInitialStateHandler : IInitialStateHandler { + /// Returns true. + public static bool IsOptimized => true; + + /// Gets the next viable starting position. + /// true if a viable starting position was found; false if no further possible match exists. [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool TryFindNextStartingPosition( - SymbolicRegexMatcher matcher, ReadOnlySpan input, ref int currentStateId, ref int pos, byte[] lookup) + SymbolicRegexMatcher matcher, ReadOnlySpan input, ref int currentStateId, ref int pos, byte[]? lookup) { - if (!matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) + Debug.Assert(!matcher._containsAnyAnchor); + Debug.Assert(matcher._findOpts is not null); + Debug.Assert(currentStateId == matcher._initialStateId, "There are no anchors, so the current state should be the sole initial state."); + + if (matcher._findOpts.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { - // No match exists - currentStateId = matcher._deadStateId; - pos = input.Length; + // There are no anchors, so there's only one starting state, so we don't need to update currentStateId that's already the starting state. + return true; } - return true; + // No match exists + Debug.Assert(pos == input.Length); + currentStateId = matcher._deadStateId; + return false; } } @@ -1665,10 +1683,10 @@ public static abstract bool IsNullableAt( public static bool IsNullableAt(SymbolicRegexMatcher matcher, in CurrentState state, int positionId) where TStateHandler : struct, IStateHandler { - var flags = TStateHandler.GetStateFlags(matcher, in state); + StateFlags flags = TStateHandler.GetStateFlags(matcher, in state); return - flags.IsNullable() || - (flags.CanBeNullable() && TStateHandler.IsNullableFor(matcher, in state, matcher.GetPositionKind(positionId))); + flags.IsNullable() || + (flags.CanBeNullable() && TStateHandler.IsNullableFor(matcher, in state, matcher.GetPositionKind(positionId))); } } From 4e760f19ef3100a65f2f43d9315f904782d0c682 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Thu, 11 Jul 2024 20:59:05 -0400 Subject: [PATCH 15/15] Remove MintermClassifier.IntLookup --- .../Text/RegularExpressions/Symbolic/MintermClassifier.cs | 6 ------ .../RegularExpressions/Symbolic/SymbolicRegexMatcher.cs | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index c336fe93ffe1fb..bf4735f2bf1459 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -116,12 +116,6 @@ public int GetMintermID(int c) /// public byte[]? ByteLookup => _lookup; - /// - /// Gets a mapping from char to minterm for the rare case when there are >= 255 minterms. - /// Null in the common case where there are fewer than 255 minterms. - /// - public int[]? IntLookup => _intLookup; - /// /// Maximum ordinal character for a non-0 minterm, used to conserve memory /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 7d002191d8267a..e67a72018dfd57 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -374,7 +374,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // The Z anchor and over 255 minterms are rare enough to consider them separate edge cases. int matchEnd; - if (!_containsEndZAnchor && _mintermClassifier.IntLookup is null) + if (!_containsEndZAnchor && _mintermClassifier.ByteLookup is not null) { // Optimize processing for the common case of no Z anchor and <= 255 minterms. Specialize each call with different generic method arguments. matchEnd = (_findOpts is not null, _containsAnyAnchor) switch