diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
index 2042b930fdd2c8..86353b31b5d7b7 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
+++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
@@ -72,6 +72,8 @@
+
+
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs
index 49205f5ee2649f..d2aec2621a81c8 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs
@@ -5,7 +5,6 @@
using System.Collections.Generic;
using System.Diagnostics;
using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
#pragma warning disable CS8500 // takes address of managed type
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/BitVectorSolver.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/BitVectorSolver.cs
index 09db2948d717be..b30527871e2bb4 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/BitVectorSolver.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/BitVectorSolver.cs
@@ -10,11 +10,11 @@ internal sealed class BitVectorSolver : ISolver
internal readonly MintermClassifier _classifier;
private readonly BitVector[] _mintermVectors;
- public BitVectorSolver(BDD[] minterms, CharSetSolver solver)
+ public BitVectorSolver(BDD[] minterms)
{
_minterms = minterms;
- _classifier = new MintermClassifier(minterms, solver);
+ _classifier = new MintermClassifier(minterms);
var singleBitVectors = new BitVector[minterms.Length];
for (int i = 0; i < singleBitVectors.Length; i++)
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs
new file mode 100644
index 00000000000000..2ea1ea8af7422c
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs
@@ -0,0 +1,39 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Diagnostics;
+
+namespace System.Text.RegularExpressions.Symbolic
+{
+ /// Provides details on how a match may be processed in reverse to find the beginning of a match once a match's existence has been confirmed.
+ internal readonly struct MatchReversalInfo where TSet : IComparable, IEquatable
+ {
+ /// Initializes the match reversal details.
+ internal MatchReversalInfo(MatchReversalKind kind, int fixedLength, MatchingState? adjustedStartState = null)
+ {
+ Debug.Assert(kind is MatchReversalKind.MatchStart or MatchReversalKind.FixedLength or MatchReversalKind.PartialFixedLength);
+ Debug.Assert(fixedLength >= 0);
+ Debug.Assert((adjustedStartState is not null) == (kind is MatchReversalKind.PartialFixedLength));
+
+ Kind = kind;
+ FixedLength = fixedLength;
+ AdjustedStartState = adjustedStartState;
+ }
+
+ /// Gets the kind of the match reversal processing required.
+ internal MatchReversalKind Kind { get; }
+
+ /// Gets the fixed length of the match, if one is known.
+ ///
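+ /// For <see cref="MatchReversalKind.MatchStart"/>, this is ignored.
+ /// For <see cref="MatchReversalKind.FixedLength"/>, this is the full length of the match. The beginning may be found simply
+ /// by subtracting this length from the end.
+ /// For <see cref="MatchReversalKind.PartialFixedLength"/>, this is the length of the fixed portion of the match.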
+ ///
+ internal int FixedLength { get; }
+
+ /// Gets the adjusted start state to use for partial fixed-length matches.
+ /// This will be non-null iff <see cref="Kind"/> is <see cref="MatchReversalKind.PartialFixedLength"/>.
+ internal MatchingState? AdjustedStartState { get; }
+ }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs
new file mode 100644
index 00000000000000..a949e6204a16a3
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs
@@ -0,0 +1,26 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+namespace System.Text.RegularExpressions.Symbolic
+{
+ /// Specifies the kind of a <see cref="MatchReversalInfo{TSet}"/>.
+ internal enum MatchReversalKind
+ {
+ /// The regex should be run in reverse to find the beginning of the match.
+ MatchStart,
+
+ /// The end of the pattern is of a fixed length and can be skipped as part of running a regex in reverse to find the beginning of the match.
+ ///
+ /// Reverse execution is not necessary for a subset of the match.
+ /// <see cref="MatchReversalInfo{TSet}.FixedLength"/> will contain the length of the fixed portion.
+ ///
+ PartialFixedLength,
+
+ /// The entire pattern is of a fixed length.
+ ///
+ /// Reverse execution is not necessary to find the beginning of the match.
+ /// <see cref="MatchReversalInfo{TSet}.FixedLength"/> will contain the length of the match.
+ ///
+ FixedLength
+ }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
index dce65a9996330d..3aacc4a61cbb94 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
@@ -14,6 +14,7 @@ internal MatchingState(SymbolicRegexNode node, uint prevCharKind)
{
Node = node;
PrevCharKind = prevCharKind;
+ NullabilityInfo = BuildNullabilityInfo();
}
/// The regular expression that labels this state and gives it its semantics.
@@ -95,21 +96,37 @@ internal SymbolicRegexNode Next(SymbolicRegexBuilder builder, TSet m
return Node.CreateNfaDerivativeWithEffects(builder, minterm, context);
}
+ /// Determines whether the node is nullable for the given context.
+ ///
+ /// This is functionally equivalent to <see cref="SymbolicRegexNode{TSet}.IsNullableFor(uint)"/>, but using cached
+ /// answers stored in <see cref="NullabilityInfo"/>.
+ ///
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal bool IsNullableFor(uint nextCharKind)
{
- Debug.Assert(CharKind.IsValidCharKind(nextCharKind));
- uint context = CharKind.Context(PrevCharKind, nextCharKind);
- return Node.IsNullableFor(context);
+ Debug.Assert(nextCharKind is >= 0 and < CharKind.CharKindCount);
+ return (NullabilityInfo & (1 << (int)nextCharKind)) != 0;
}
+ /// Gets the nullability info for the matching state.
+ ///
+ ///
+ /// 00000 -> node cannot be nullable
+ /// 00001 -> nullable for General
+ /// 00010 -> nullable for BeginningEnd
+ /// 00100 -> nullable for NewLine
+ /// 01000 -> nullable for NewLineS
+ /// 10000 -> nullable for WordLetter
+ ///
+ ///
+ internal int NullabilityInfo { get; }
+
///
/// Builds a with the relevant flags set.
///
- /// a solver for
/// whether this state is an initial state
/// the flags for this matching state
- internal StateFlags BuildStateFlags(ISolver solver, bool isInitial)
+ internal StateFlags BuildStateFlags(bool isInitial)
{
StateFlags info = 0;
@@ -118,11 +135,6 @@ internal StateFlags BuildStateFlags(ISolver solver, bool isInitial)
info |= StateFlags.IsInitialFlag;
}
- if (IsDeadend(solver))
- {
- info |= StateFlags.IsDeadendFlag;
- }
-
if (Node.CanBeNullable)
{
info |= StateFlags.CanBeNullableFlag;
@@ -140,6 +152,22 @@ internal StateFlags BuildStateFlags(ISolver solver, bool isInitial)
return info;
}
+ /// Builds the nullability information for the matching state.
+ /// Nullability for each context is encoded in a bit. See <see cref="NullabilityInfo"/>.
+ private byte BuildNullabilityInfo()
+ {
+ byte nullabilityInfo = 0;
+ if (Node.CanBeNullable)
+ {
+ for (uint charKind = 0; charKind < CharKind.CharKindCount; charKind++)
+ {
+ nullabilityInfo |= (byte)(Node.IsNullableFor(CharKind.Context(PrevCharKind, charKind)) ? 1 << (int)charKind : 0);
+ }
+ }
+
+ return nullabilityInfo;
+ }
+
public override bool Equals(object? obj) =>
obj is MatchingState s && PrevCharKind == s.PrevCharKind && Node.Equals(s.Node);
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
index d00fcc0d62ff40..24d2a26f849229 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -1,7 +1,9 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
+using System.Buffers;
using System.Diagnostics;
+using System.Numerics;
using System.Runtime.CompilerServices;
namespace System.Text.RegularExpressions.Symbolic
@@ -20,81 +22,104 @@ namespace System.Text.RegularExpressions.Symbolic
///
internal sealed class MintermClassifier
{
- /// An array used when there's a single minterm, in order to map every ASCII character to it trivially.
- private static readonly int[] AllAsciiIsZeroMintermArray = new int[128];
+ /// Mapping for characters to minterms, used in the vast majority case when there are less than 256 minterms.
+ /// _lookup[char] provides the minterm ID. If char >= _lookup.Length, its minterm is 0.
+ private readonly byte[]? _lookup;
- /// Array providing fast mapping from an ASCII character (the array index) to its corresponding minterm ID.
- private readonly int[] _ascii;
- /// A multi-terminal BDD for mapping any non-ASCII character to its associated minterm ID.
- ///
- /// The use of a multi-terminal BDD here is an implementation detail. Should we decide its important to optimize non-ASCII inputs further,
- /// or to consolidate the mechanism with the other engines, an alternatie lookup algorithm / data structure could be employed.
- ///
- private readonly BDD _nonAscii;
+ /// Mapping for characters to minterms, used when there are at least 256 minterms. This is rarely used.
+ /// _intLookup[char] provides the minterm ID. If char >= _intLookup.Length, its minterm is 0.
+ private readonly int[]? _intLookup;
/// Create a classifier that maps a character to the ID of its associated minterm.
/// A BDD for classifying all characters (ASCII and non-ASCII) to their corresponding minterm IDs.
- /// The character set solver to use.
- public MintermClassifier(BDD[] minterms, CharSetSolver solver)
+ public MintermClassifier(BDD[] minterms)
{
Debug.Assert(minterms.Length > 0, "Requires at least");
if (minterms.Length == 1)
{
// With only a single minterm, the mapping is trivial: everything maps to it (ID 0).
- // For ASCII, use an array containing all zeros. For non-ASCII, use a BDD that maps everything to 0.
- _ascii = AllAsciiIsZeroMintermArray;
- _nonAscii = solver.ReplaceTrue(BDD.True, 0);
+ _lookup = [];
return;
}
- // Create a multi-terminal BDD for mapping any character to its associated minterm.
- BDD anyCharacterToMintermId = BDD.False;
- for (int i = 0; i < minterms.Length; i++)
- {
- // Each supplied minterm BDD decides whether a given character maps to it or not.
- // We need to combine all of those into a multi-terminal BDD that decides which
- // minterm a character maps to. To do that, we take each minterm BDD and replace
- // its True result with the ID of the minterm, such that a character that would
- // have returned True for that BDD now returns the minterm ID.
- BDD charToTargetMintermId = solver.ReplaceTrue(minterms[i], i);
+ // Compute all minterm ranges. We do this here in order to determine the maximum character value
+ // in order to size the lookup array to minimize steady-state memory consumption of the potentially
+ // large lookup array. We prefer to use the byte[] _lookup when possible, in order to keep memory
+ // consumption to a minimum; doing so accommodates up to 255 minterms, which is the vast majority case.
+ // However, when there are more than 255 minterms, we need to use int[] _intLookup.
+ (uint, uint)[][] charRangesPerMinterm = ArrayPool<(uint, uint)[]>.Shared.Rent(minterms.Length);
- // Now union this BDD with the multi-terminal BDD we've built up thus far. Unioning
- // is valid because every character belongs to exactly one minterm and thus will
- // only map to an ID instead of False in exactly one of the input BDDs.
- anyCharacterToMintermId = solver.Or(anyCharacterToMintermId, charToTargetMintermId);
+ int maxChar = -1;
+ for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
+ {
+ (uint, uint)[] ranges = BDDRangeConverter.ToRanges(minterms[mintermId]);
+ charRangesPerMinterm[mintermId] = ranges;
+ maxChar = Math.Max(maxChar, (int)ranges[^1].Item2);
}
- // Now that we have our mapping that supports any input character, we want to optimize for
- // ASCII inputs. Rather than forcing every input ASCII character to consult the BDD at match
- // time, we precompute a lookup table, where each ASCII character can be used to index into the
- // array to determine the ID for its corresponding minterm.
- var ascii = new int[128];
- for (int i = 0; i < ascii.Length; i++)
+ // It's incredibly rare for a regex to use more than a couple hundred minterms,
+ // but we need a fallback just in case. (Over 128 unique sets also means it's never ASCII only.)
+ if (minterms.Length > 255)
+ {
+ _intLookup = CreateLookup(minterms, charRangesPerMinterm, maxChar);
+ }
+ else
{
- ascii[i] = anyCharacterToMintermId.Find(i);
+ _lookup = CreateLookup(minterms, charRangesPerMinterm, maxChar);
}
- _ascii = ascii;
- // We can also further optimize the BDD in two ways:
- // 1. We can now remove the ASCII characters from it, as we'll always consult the lookup table first
- // for ASCII inputs and thus will never use the BDD for them. While optional (skipping this step will not
- // affect correctness), removing the ASCII values from the BDD reduces the size of the multi-terminal BDD.
- // 2. We can check if every character now maps to the same minterm ID (the same terminal in the
- // multi-terminal BDD). This can be relatively common after (1) above is applied, as many
- // patterns don't distinguish between any non-ASCII characters (e.g. "[0-9]*"). If every character
- // in the BDD now maps to the same minterm, we can replace the BDD with a much simpler/faster/smaller one.
- BDD nonAsciiBDD = solver.And(anyCharacterToMintermId, solver.NonAscii);
- nonAsciiBDD = nonAsciiBDD.IsEssentiallyBoolean(out BDD? singleTerminalBDD) ? singleTerminalBDD : nonAsciiBDD;
- _nonAscii = nonAsciiBDD;
+ // Return the rented array. We clear it before returning it in order to avoid all the ranges arrays being kept alive.
+ Array.Clear(charRangesPerMinterm, 0, minterms.Length);
+ ArrayPool<(uint, uint)[]>.Shared.Return(charRangesPerMinterm);
+
+ // Creates the lookup array.
+ static T[] CreateLookup(BDD[] minterms, ReadOnlySpan<(uint, uint)[]> charRangesPerMinterm, int _maxChar) where T : IBinaryInteger
+ {
+ T[] lookup = new T[_maxChar + 1];
+ for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
+ {
+ // Each minterm maps to a range of characters. Set each of the characters in those ranges to the corresponding minterm.
+ foreach ((uint start, uint end) in charRangesPerMinterm[mintermId])
+ {
+ lookup.AsSpan((int)start, (int)(end + 1 - start)).Fill(T.CreateTruncating(mintermId));
+ }
+ }
+
+ return lookup;
+ }
}
- /// Gets the ID of the minterm associated with the specified character.
+ /// Gets the ID of the minterm associated with the specified character.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int GetMintermID(int c)
{
- int[] ascii = _ascii;
- return (uint)c < (uint)ascii.Length ? ascii[c] : _nonAscii.Find(c);
+ if (_lookup is not null)
+ {
+ byte[] lookup = _lookup;
+ return (uint)c < (uint)lookup.Length ? lookup[c] : 0;
+ }
+ else
+ {
+ int[] lookup = _intLookup!;
+ return (uint)c < (uint)lookup.Length ? lookup[c] : 0;
+ }
}
+ ///
+ /// Gets a quick mapping from char to minterm for the common case when there are <= 255 minterms.
+ /// Null if there are greater than 255 minterms.
+ ///
+ public byte[]? ByteLookup => _lookup;
+
+ ///
+ /// Gets a mapping from char to minterm for the rare case when there are more than 255 minterms.
+ /// Null in the common case where there are 255 or fewer minterms.
+ ///
+ public int[]? IntLookup => _intLookup;
+
+ ///
+ /// Maximum ordinal character for a non-0 minterm, used to conserve memory
+ ///
+ public int MaxChar => (_lookup?.Length ?? _intLookup!.Length) - 1;
}
}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs
index 5a620f3771be6f..b446fecdca28f5 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs
@@ -6,18 +6,18 @@ namespace System.Text.RegularExpressions.Symbolic
///
/// These flags provide context-independent information available for every state. They provide a fast way to evaluate
/// conditions in the inner matching loops of . The matcher caches one of these
- /// for every state, for which they are created by .
+ /// for every state, for which they are created by <see cref="MatchingState{TSet}.BuildStateFlags(bool)"/>.
/// In DFA mode the cached flags are used directly, while in NFA mode the
/// handles aggregating the flags in the state set.
///
[Flags]
internal enum StateFlags : byte
{
+ None = 0,
IsInitialFlag = 1,
- IsDeadendFlag = 2,
- IsNullableFlag = 4,
- CanBeNullableFlag = 8,
- SimulatesBacktrackingFlag = 16,
+ IsNullableFlag = 2,
+ CanBeNullableFlag = 4,
+ SimulatesBacktrackingFlag = 8,
}
///
@@ -25,10 +25,9 @@ internal enum StateFlags : byte
///
internal static class StateFlagsExtensions
{
- internal static bool IsInitial(this StateFlags info) => (info & StateFlags.IsInitialFlag) != 0;
- internal static bool IsDeadend(this StateFlags info) => (info & StateFlags.IsDeadendFlag) != 0;
- internal static bool IsNullable(this StateFlags info) => (info & StateFlags.IsNullableFlag) != 0;
- internal static bool CanBeNullable(this StateFlags info) => (info & StateFlags.CanBeNullableFlag) != 0;
- internal static bool SimulatesBacktracking(this StateFlags info) => (info & StateFlags.SimulatesBacktrackingFlag) != 0;
+ internal static bool IsInitial(this StateFlags info) => (info & StateFlags.IsInitialFlag) != StateFlags.None;
+ internal static bool IsNullable(this StateFlags info) => (info & StateFlags.IsNullableFlag) != StateFlags.None;
+ internal static bool CanBeNullable(this StateFlags info) => (info & StateFlags.CanBeNullableFlag) != StateFlags.None;
+ internal static bool SimulatesBacktracking(this StateFlags info) => (info & StateFlags.SimulatesBacktrackingFlag) != StateFlags.None;
}
}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs
index ff95195292bfa4..b0aa0cd6e938de 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs
@@ -17,6 +17,7 @@ namespace System.Text.RegularExpressions.Symbolic
private const uint IsHighPriorityNullableMask = 64;
private const uint ContainsEffectMask = 128;
private const uint ContainsLineAnchorMask = 256;
+ private const uint ContainsEndZAnchorMask = 512;
private readonly uint _info;
@@ -26,7 +27,7 @@ private static SymbolicRegexInfo Create(
bool isAlwaysNullable = false, bool canBeNullable = false,
bool startsWithLineAnchor = false, bool containsLineAnchor = false,
bool startsWithSomeAnchor = false, bool containsSomeAnchor = false,
- bool isHighPriorityNullable = false, bool containsEffect = false)
+ bool isHighPriorityNullable = false, bool containsEffect = false, bool containsEndZAnchor = false)
{
// Assert that the expected implications hold. For example, every node that contains a line anchor
// must also be marked as containing some anchor.
@@ -43,7 +44,8 @@ private static SymbolicRegexInfo Create(
(startsWithSomeAnchor ? StartsWithSomeAnchorMask : 0) |
(containsSomeAnchor ? ContainsSomeAnchorMask : 0) |
(isHighPriorityNullable ? IsHighPriorityNullableMask : 0) |
- (containsEffect ? ContainsEffectMask : 0));
+ (containsEffect ? ContainsEffectMask : 0) |
+ (containsEndZAnchor ? ContainsEndZAnchorMask : 0));
}
public bool IsNullable => (_info & IsAlwaysNullableMask) != 0;
@@ -63,6 +65,7 @@ private static SymbolicRegexInfo Create(
public bool IsHighPriorityNullable => (_info & IsHighPriorityNullableMask) != 0;
public bool ContainsEffect => (_info & ContainsEffectMask) != 0;
+ public bool ContainsEndZAnchor => (_info & ContainsEndZAnchorMask) != 0;
///
/// Used for any node that acts as an epsilon, i.e., something that always matches the empty string.
@@ -77,13 +80,15 @@ public static SymbolicRegexInfo Epsilon() =>
/// Used for all anchors.
///
/// whether this anchor is a line anchor
- public static SymbolicRegexInfo Anchor(bool isLineAnchor) =>
+ /// whether this anchor is an end Z anchor
+ public static SymbolicRegexInfo Anchor(bool isLineAnchor, bool isEndZAnchor) =>
Create(
canBeNullable: true,
startsWithLineAnchor: isLineAnchor,
containsLineAnchor: isLineAnchor,
startsWithSomeAnchor: true,
- containsSomeAnchor: true);
+ containsSomeAnchor: true,
+ containsEndZAnchor: isEndZAnchor);
///
/// The alternation remains high priority nullable if the left alternative is so.
@@ -99,7 +104,8 @@ public static SymbolicRegexInfo Alternate(SymbolicRegexInfo left_info, SymbolicR
startsWithSomeAnchor: left_info.StartsWithSomeAnchor || right_info.StartsWithSomeAnchor,
containsSomeAnchor: left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor,
isHighPriorityNullable: left_info.IsHighPriorityNullable,
- containsEffect: left_info.ContainsEffect || right_info.ContainsEffect);
+ containsEffect: left_info.ContainsEffect || right_info.ContainsEffect,
+ containsEndZAnchor: left_info.ContainsEndZAnchor || right_info.ContainsEndZAnchor);
///
/// Concatenation remains high priority nullable if both left and right are so.
@@ -115,7 +121,9 @@ public static SymbolicRegexInfo Concat(SymbolicRegexInfo left_info, SymbolicRege
startsWithSomeAnchor: left_info.StartsWithSomeAnchor || (left_info.CanBeNullable && right_info.StartsWithSomeAnchor),
containsSomeAnchor: left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor,
isHighPriorityNullable: left_info.IsHighPriorityNullable && right_info.IsHighPriorityNullable,
- containsEffect: left_info.ContainsEffect || right_info.ContainsEffect);
+ containsEffect: left_info.ContainsEffect || right_info.ContainsEffect,
+ containsEndZAnchor: left_info.ContainsEndZAnchor || right_info.ContainsEndZAnchor
+ );
///
/// Inherits anchor visibility from the loop body.
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
index 278b69fe391fef..327f5666f9e2a5 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs
@@ -4,6 +4,7 @@
using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
+using System.Runtime.CompilerServices;
using System.Threading;
namespace System.Text.RegularExpressions.Symbolic
@@ -25,7 +26,7 @@ internal sealed partial class SymbolicRegexMatcher
/// Cache for the states that have been created. Each state is uniquely identified by its associated
/// and the kind of the previous character.
///
- private readonly Dictionary<(SymbolicRegexNode Node, uint PrevCharKind), MatchingState> _stateCache = new();
+ private readonly Dictionary<(SymbolicRegexNode Node, uint PrevCharKind), MatchingState> _stateCache = [];
///
/// Maps state ids to states, initial capacity is given by .
@@ -40,6 +41,14 @@ internal sealed partial class SymbolicRegexMatcher
///
private StateFlags[] _stateFlagsArray;
+ /// Cached nullability info for each state ID.
+ ///
+ /// _nullabilityArray[stateId] == the <see cref="MatchingState{TSet}.NullabilityInfo"/> for that state.
+ /// Used to short-circuit nullability in the hot loop.
+ /// Important: the pattern must not contain endZ for this to be valid.
+ ///
+ private byte[] _nullabilityArray;
+
///
/// The transition function for DFA mode.
/// Each state has a range of consecutive entries for each minterm ID. A range of size 2^L, where L is
@@ -69,7 +78,7 @@ internal sealed partial class SymbolicRegexMatcher
/// It is the inverse of used entries in _nfaStateArray.
/// The range of this map is 0 to its size - 1.
///
- private readonly Dictionary _nfaIdByCoreId = new();
+ private readonly Dictionary _nfaIdByCoreId = [];
///
/// Transition function for NFA transitions in NFA mode.
@@ -107,6 +116,13 @@ private static void ArrayResizeAndVolatilePublish(ref T[] array, int newSize)
private int DeltaOffset(int stateId, int mintermId) => (stateId << _mintermsLog) | mintermId;
+ ///
+ /// Pre-computed hot-loop version of nullability check
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private bool IsNullableWithContext(int stateId, int mintermId) =>
+ (_nullabilityArray[stateId] & (1 << (int)GetPositionKind(mintermId))) > 0;
+
/// Returns the span from that may contain transitions for the given state
private Span GetDeltasFor(MatchingState state)
{
@@ -152,6 +168,78 @@ private MatchingState GetOrCreateState(SymbolicRegexNode node, uint
return GetOrCreateState_NoLock(node, prevCharKind);
}
+ ///
+ /// Analyze the specified reversed pattern to gather details that help to optimize the reverse matching process
+ /// for when finding the beginning of a match.
+ ///
+ ///
+ /// Optimized reversal state computation during construction which skips the fixed length suffix, e.g. for the pattern abc.*def
+ /// 1) the end is found at abc.*def|
+ /// 2) the reversal starts at abc.*|
+ ///
+ /// Reversed initial pattern
+ /// The match reversal details.
+ private MatchReversalInfo CreateOptimizedReversal(SymbolicRegexNode node)
+ {
+ int pos = 0;
+ while (true)
+ {
+ if (node._info.ContainsSomeAnchor)
+ {
+ // Bail if it contains any anchors as it invalidates the optimization.
+ // (This could potentially be a very good future optimization for anchors but there are too many edge cases to guarantee it works.
+ // One example which fails currently: pattern: @"\By\b", input: "xy")
+ pos = 0;
+ break;
+ }
+
+ if (node._kind is not SymbolicRegexNodeKind.Concat)
+ {
+ if (node._kind is SymbolicRegexNodeKind.CaptureStart)
+ {
+ node = _builder.Epsilon; // The entire match is fixed length.
+ }
+ break;
+ }
+
+ SymbolicRegexNode? left = node._left;
+ Debug.Assert(left is not null);
+
+ if (left._kind is SymbolicRegexNodeKind.CaptureEnd or SymbolicRegexNodeKind.BoundaryAnchor or SymbolicRegexNodeKind.Singleton)
+ {
+ node = node._right!;
+ if (left._kind is SymbolicRegexNodeKind.Singleton)
+ {
+ pos++;
+ }
+ }
+ else if (left._kind is SymbolicRegexNodeKind.Loop)
+ {
+ if (left._lower <= 0 || left._left!.Kind is not SymbolicRegexNodeKind.Singleton)
+ {
+ break;
+ }
+
+ node = left._lower == left._upper ?
+ node._right! : // The entire loop is fixed
+ _builder.CreateConcat( // Subtract the fixed part of the loop.
+ _builder.CreateLoop(left._left, left.IsLazy, 0, left._upper - left._lower),
+ node._right!);
+ pos += left._lower;
+ }
+ else
+ {
+ break;
+ }
+ }
+
+ Debug.Assert(pos >= 0);
+ return
+ pos == 0 ? new MatchReversalInfo(MatchReversalKind.MatchStart, 0) :
+ node == _builder.Epsilon ? new MatchReversalInfo(MatchReversalKind.FixedLength, pos) :
+ new MatchReversalInfo(MatchReversalKind.PartialFixedLength, pos, GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(node), 0));
+ }
+
///
/// Create a state with given node and previous character context.
///
@@ -178,9 +266,11 @@ private MatchingState GetOrCreateState_NoLock(SymbolicRegexNode node
ArrayResizeAndVolatilePublish(ref _stateArray, newsize);
ArrayResizeAndVolatilePublish(ref _dfaDelta, newsize << _mintermsLog);
ArrayResizeAndVolatilePublish(ref _stateFlagsArray, newsize);
+ ArrayResizeAndVolatilePublish(ref _nullabilityArray, newsize);
}
_stateArray[state.Id] = state;
- _stateFlagsArray[state.Id] = state.BuildStateFlags(Solver, isInitialState);
+ _stateFlagsArray[state.Id] = state.BuildStateFlags(isInitialState);
+ _nullabilityArray[state.Id] = (byte)state.NullabilityInfo;
}
return state;
@@ -266,17 +356,18 @@ private int GetCoreStateId(int nfaStateId)
/// Gets or creates a new DFA transition.
/// This function locks the matcher for safe concurrent use of the
private bool TryCreateNewTransition(
- MatchingState sourceState, int mintermId, int offset, bool checkThreshold, [NotNullWhen(true)] out MatchingState? nextState)
+ MatchingState sourceState, int mintermId, int offset, bool checkThreshold, [NotNullWhen(true)] out MatchingState? nextState,
+ long timeoutOccursAt = 0)
{
Debug.Assert(offset < _dfaDelta.Length);
-
lock (this)
{
// check if meanwhile delta[offset] has become defined possibly by another thread
MatchingState? targetState = _stateArray[_dfaDelta[offset]];
if (targetState is null)
{
- if (checkThreshold && _stateCache.Count >= SymbolicRegexThresholds.NfaThreshold)
+ if ((timeoutOccursAt != 0 && Environment.TickCount64 > timeoutOccursAt) || // if there's an active timer
+ (checkThreshold && _builder._nodeCache.Count >= SymbolicRegexThresholds.NfaNodeCountThreshold)) // if # of nodes exceeds the NFA threshold
{
nextState = null;
return false;
@@ -312,10 +403,10 @@ private int[] CreateNewNfaTransition(int nfaStateId, int mintermId, int nfaOffse
MatchingState coreState = GetState(coreId);
TSet minterm = GetMintermFromId(mintermId);
uint nextCharKind = GetPositionKind(mintermId);
- SymbolicRegexNode? targetNode = coreTargetId > 0 ?
+ SymbolicRegexNode targetNode = coreTargetId > 0 ?
GetState(coreTargetId).Node : coreState.Next(_builder, minterm, nextCharKind);
- List targetsList = new();
+ List targetsList = [];
ForEachNfaState(targetNode, nextCharKind, targetsList, static (int nfaId, List targetsList) =>
targetsList.Add(nfaId));
@@ -342,8 +433,9 @@ private int[] CreateNewNfaTransition(int nfaStateId, int mintermId, int nfaOffse
TSet minterm = GetMintermFromId(mintermId);
uint nextCharKind = GetPositionKind(mintermId);
List<(SymbolicRegexNode Node, DerivativeEffect[] Effects)>? transition = coreState.NfaNextWithEffects(_builder, minterm, nextCharKind);
+
// Build the new state and store it into the array.
- List<(int, DerivativeEffect[])> targetsList = new();
+ List<(int, DerivativeEffect[])> targetsList = [];
foreach ((SymbolicRegexNode Node, DerivativeEffect[] Effects) entry in transition)
{
ForEachNfaState(entry.Node, nextCharKind, (targetsList, entry.Effects),
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 4394329f8eae21..08f423b03344ad 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -81,6 +81,20 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher
/// Data and routines for skipping ahead to the next place a match could potentially start.
private readonly RegexFindOptimizations? _findOpts;
+ ///
+ /// Dead end state to quickly return NoMatch.
+ ///
+ private readonly int _deadStateId;
+
+ /// Initial state used for vectorization.
+ private readonly int _initialStateId;
+
+ /// Whether the pattern contains any anchor.
+ private readonly bool _containsAnyAnchor;
+
+ /// Whether the pattern contains the EndZ anchor, which invalidates most optimization shortcuts.
+ private readonly bool _containsEndZAnchor;
+
/// The initial states for the original pattern, keyed off of the previous character kind.
/// If the pattern doesn't contain any anchors, there will only be a single initial state.
private readonly MatchingState[] _initialStates;
@@ -93,6 +107,9 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher
/// If the pattern doesn't contain any anchors, there will only be a single initial state.
private readonly MatchingState[] _reverseInitialStates;
+ /// Details on optimized processing of the reverse of the pattern to find the beginning of a match.
+ private readonly MatchReversalInfo _optimizedReversalInfo;
+
/// Partition of the input space of sets.
private readonly TSet[] _minterms;
@@ -169,9 +186,10 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo
((BitVectorSolver)(object)builder._solver)._classifier;
_capsize = captureCount;
- // Initialization for fields in SymbolicRegexMatcher.Automata.cs
+ // Initialize state and nullability arrays.
_stateArray = new MatchingState[InitialDfaStateCapacity];
_stateFlagsArray = new StateFlags[InitialDfaStateCapacity];
+ _nullabilityArray = new byte[InitialDfaStateCapacity];
_dfaDelta = new int[InitialDfaStateCapacity << _mintermsLog];
// Initialize a lookup array for the character kinds of each minterm ID. This includes one "special" minterm
@@ -183,6 +201,9 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo
_positionKinds[mintermId + 1] = CalculateMintermIdKind(mintermId);
}
+ // Gather optimized reversal processing information.
+ _optimizedReversalInfo = CreateOptimizedReversal(_pattern.Reverse(builder));
+
// Store the find optimizations that can be used to jump ahead to the next possible starting location.
// If there's a leading beginning anchor, the find optimizations are unnecessary on top of the DFA's
// handling for beginning anchors.
@@ -199,6 +220,10 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo
// The loops below and how character kinds are calculated assume that the "general" character kind is zero
Debug.Assert(CharKind.General == 0);
+ // Assign edge case info for quick lookup
+ _containsAnyAnchor = _pattern._info.ContainsSomeAnchor;
+ _containsEndZAnchor = _pattern._info.ContainsEndZAnchor;
+
// Create the initial states for the original pattern.
var initialStates = new MatchingState[statesCount];
for (uint charKind = 0; charKind < initialStates.Length; charKind++)
@@ -221,6 +246,10 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo
}
_dotstarredInitialStates = dotstarredInitialStates;
+ // Assign dead and initial state ids
+ _deadStateId = GetOrCreateState_NoLock(_builder._nothing, 0).Id;
+ _initialStateId = _dotstarredInitialStates[CharKind.General].Id;
+
// Create the reverse pattern (the original pattern in reverse order) and all of its
// initial states. Also disable backtracking simulation to ensure the reverse path from
// the final state that was found is followed. Not doing so might cause the earliest
@@ -342,18 +371,27 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i
// As an example, consider the pattern a{1,3}(b*) run against an input of aacaaaabbbc: phase 1 will find
// the position of the last b: aacaaaabbbc. It additionally records the position of the first a after
// the c as the low boundary for the starting position.
- int matchStartLowBoundary, matchStartLengthMarker;
- int matchEnd = (_pattern._info.ContainsLineAnchor, _findOpts is not null, _pattern._info.ContainsSomeAnchor) switch
+
+ // The Z anchor and over 255 minterms are rare enough to consider them separate edge cases.
+ int matchEnd;
+ if (!_containsEndZAnchor && _mintermClassifier.IntLookup is null)
{
- (true, true, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
- (true, true, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
- (true, false, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
- (true, false, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
- (false, true, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
- (false, true, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
- (false, false, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
- (false, false, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData),
- };
+ // Optimize processing for the common case of no Z anchor and <= 255 minterms. Specialize each call with different generic method arguments.
+ matchEnd = (_findOpts is not null, _containsAnyAnchor) switch
+ {
+ (true, true) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData),
+ (true, false) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData),
+ (false, false) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData),
+ (false, true) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData),
+ };
+ }
+ else
+ {
+ // Fallback for Z anchor or over 255 minterms
+ matchEnd = _findOpts is not null ?
+ FindEndPositionFallback(input, startat, timeoutOccursAt, mode, perThreadData) :
+ FindEndPositionFallback(input, startat, timeoutOccursAt, mode, perThreadData);
+ }
// If there wasn't a match, we're done.
if (matchEnd == NoMatchExists)
@@ -374,22 +412,57 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i
// recorded a fixed-length marker for the portion of the pattern that matched, as we can then jump that
// exact number of positions backwards. Continuing the previous example, phase 2 will walk backwards from
// that last b until it finds the 4th a: aaabbbc.
- int matchStart;
- if (matchStartLengthMarker >= 0)
- {
- matchStart = matchEnd - matchStartLengthMarker;
- }
- else
+ int matchStart = 0;
+ Debug.Assert(matchEnd >= startat - 1);
+ switch (_optimizedReversalInfo.Kind)
{
- Debug.Assert(matchEnd >= startat - 1);
- matchStart = matchEnd < startat ?
- startat : (_pattern._info.ContainsLineAnchor, _pattern._info.ContainsSomeAnchor) switch
+ case MatchReversalKind.MatchStart:
+ case MatchReversalKind.PartialFixedLength:
+ int initialLastStart = -1; // invalid sentinel value
+ int i = matchEnd;
+ CurrentState reversalStartState;
+
+ if (_optimizedReversalInfo.Kind is MatchReversalKind.MatchStart)
+ {
+ // No fixed-length knowledge. Start at the end of the match.
+ reversalStartState = new CurrentState(_reverseInitialStates[GetCharKind(input, matchEnd)]);
+ }
+ else
+ {
+ // There's a fixed-length portion at the end of the match. Start just before it.
+ i -= _optimizedReversalInfo.FixedLength;
+ reversalStartState = new CurrentState(_optimizedReversalInfo.AdjustedStartState!);
+
+ // reversal may already be nullable here in the case of anchors
+ if (_containsAnyAnchor &&
+ _nullabilityArray[reversalStartState.DfaStateId] > 0 &&
+ FullNullabilityHandler.IsNullableAt(
+ this, in reversalStartState, FullInputReader.GetPositionId(this, input, i),
+ DfaStateHandler.GetStateFlags(this, in reversalStartState)))
+ {
+ initialLastStart = i;
+ }
+ }
+
+ matchStart = matchEnd < startat ? startat : (_containsEndZAnchor, _containsAnyAnchor) switch
{
- (true, true) => FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData),
- (true, false) => FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData),
- (false, true) => FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData),
- (false, false) => FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData),
+ // Call FindStartPosition with generic method arguments based on the presence of anchors. This is purely an optimization;
+ // the (true, true) case is functionally complete whereas the (false, false) case is the most optimized.
+ (true, true) => FindStartPosition(reversalStartState, initialLastStart, input, i, startat, perThreadData),
+ (true, false) => FindStartPosition(reversalStartState, initialLastStart, input, i, startat, perThreadData),
+ (false, true) => FindStartPosition(reversalStartState, initialLastStart, input, i, startat, perThreadData),
+ (false, false) => FindStartPosition(reversalStartState, initialLastStart, input, i, startat, perThreadData),
};
+ break;
+
+ case MatchReversalKind.FixedLength:
+ // The whole match is known to be of a fixed length, so we don't need to do any processing to find its beginning, just jump there.
+ matchStart = matchEnd - _optimizedReversalInfo.FixedLength;
+ break;
+
+ default:
+ Debug.Fail($"Unexpected reversal kind: {_optimizedReversalInfo.Kind}");
+ break;
}
// Phase 3:
@@ -403,30 +476,91 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i
}
else
{
- Registers endRegisters = _pattern._info.ContainsLineAnchor ?
+ Registers endRegisters = _containsAnyAnchor ?
FindSubcaptures(input, matchStart, matchEnd, perThreadData) :
FindSubcaptures(input, matchStart, matchEnd, perThreadData);
return new SymbolicMatch(matchStart, matchEnd - matchStart, endRegisters.CaptureStarts, endRegisters.CaptureEnds);
}
}
+ /// <summary>
+ /// Streamlined version of <c>FindEndPositionFallback</c> that doesn't handle \Z anchors or very large sets of minterms (more than 255).
+ /// </summary>
+ private int FindEndPositionOptimized(
+ ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData)
+ where TAcceleratedStateHandler : struct, IAcceleratedStateHandler
+ where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler
+ {
+ // Initial state candidate. (This is not used in the common DFA case and could potentially be removed in the future; it is only passed to the NFA inner loop below.)
+ int initialStatePosCandidate = pos;
+ var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, pos - 1)]);
+ int endPos = NoMatchExists;
+ int lengthMinus1 = input.Length - 1;
+
+ while (true)
+ {
+ int innerLoopLength;
+ bool done;
+ if (currentState.NfaState is null)
+ {
+ const int DfaCharsPerTimeoutCheck = 100_000;
+ innerLoopLength = _checkTimeout && lengthMinus1 - pos > DfaCharsPerTimeoutCheck ? pos + DfaCharsPerTimeoutCheck : lengthMinus1;
+ done = FindEndPositionDeltasDFAOptimized(
+ input, innerLoopLength, mode, timeoutOccursAt, ref pos,
+ ref currentState.DfaStateId, ref endPos);
+ }
+ else
+ {
+ // NFA fallback check: assume \Z and full nullability for NFA since it's already extremely rare to get here and it's not worth special-casing.
+ const int NfaCharsPerTimeoutCheck = 1_000;
+ innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck ? pos + NfaCharsPerTimeoutCheck : input.Length;
+ done = FindEndPositionDeltasNFA(
+ input, innerLoopLength, mode, timeoutOccursAt, ref pos,
+ ref currentState, ref endPos, ref initialStatePosCandidate, ref initialStatePosCandidate);
+ }
+
+ // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or
+ // there is no more input available, then the whole search is done.
+ if (done || pos >= input.Length)
+ {
+ break;
+ }
+
+ // The search did not finish, so we either failed to transition (which should only happen if we were in DFA mode and
+ // need to switch over to NFA mode) or ran out of input in the inner loop. Check if the inner loop still had more
+ // input available.
+ if (pos < innerLoopLength)
+ {
+ // Because there was still more input available, a failure to transition in DFA mode must be the cause
+ // of the early exit. Upgrade to NFA mode, seeding the per-thread NFA state from the current DFA state.
+ NfaMatchingState nfaState = perThreadData.NfaState;
+ nfaState.InitializeFrom(this, GetState(currentState.DfaStateId));
+ currentState = new CurrentState(nfaState);
+ }
+
+ // Check for a timeout before continuing with the next slice of input.
+ if (_checkTimeout)
+ {
+ CheckTimeout(timeoutOccursAt);
+ }
+ }
+ return endPos;
+ }
+
/// Performs the initial Phase 1 match to find the end position of the match, or first final state if this is an isMatch call.
/// The input text.
/// The starting position in .
/// The time at which timeout occurs, if timeouts are being checked.
/// The mode of execution based on the regex operation being performed.
- /// The last position the initial state of was visited before the end position was found.
- /// Length of the match if there's a match; otherwise, -1.
/// Per thread data reused between calls.
///
/// A one-past-the-end index into input for the preferred match, or first final state position if isMatch is true, or NoMatchExists if no match exists.
///
- private int FindEndPosition(ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, out int initialStatePos, out int matchLength, PerThreadData perThreadData)
+ private int FindEndPositionFallback(ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData)
where TInputReader : struct, IInputReader
where TFindOptimizationsHandler : struct, IInitialStateHandler
where TNullabilityHandler : struct, INullabilityHandler
{
- initialStatePos = pos;
int initialStatePosCandidate = pos;
var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, pos - 1)]);
@@ -442,14 +576,24 @@ private int FindEndPosition CharsPerTimeoutCheck ?
- pos + CharsPerTimeoutCheck :
- input.Length;
-
- bool done = currentState.NfaState is not null ?
- FindEndPositionDeltas(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) :
- FindEndPositionDeltas(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate);
+ // The fallback function has lower limits due to worse performance from edge cases
+ int innerLoopLength;
+ bool done;
+ if (currentState.NfaState is null)
+ {
+ const int DfaCharsPerTimeoutCheck = 25_000;
+ innerLoopLength = _checkTimeout && input.Length - pos > DfaCharsPerTimeoutCheck ? pos + DfaCharsPerTimeoutCheck : input.Length;
+ done = FindEndPositionDeltasDFA(
+ input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePosCandidate);
+ }
+ else
+ {
+ // NFA fallback check, assume \Z and full nullability for NFA since it's already extremely rare to get here.
+ const int NfaCharsPerTimeoutCheck = 1_000;
+ innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck ? pos + NfaCharsPerTimeoutCheck : input.Length;
+ done = FindEndPositionDeltasNFA(
+ input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePosCandidate);
+ }
// If the inner loop indicates that the search finished (for example due to reaching a deadend state) or
// there is no more input available, then the whole search is done.
@@ -476,15 +620,119 @@ private int FindEndPosition 0 ? GetState(endStateId).FixedLength(GetCharKind(input, endPos)) : -1;
return endPos;
}
+
+ /// <summary>
+ /// This version of <c>FindEndPositionDeltasDFA</c> uses a different set of interfaces,
+ /// which don't check for many inner loop edge cases, e.g. input end or '\n'.
+ /// All edge cases are handled before entering the loop.
+ /// </summary>
+ private bool FindEndPositionDeltasDFAOptimized(
+ ReadOnlySpan input, int lengthMinus1, RegexRunnerMode mode,
+ long timeoutOccursAt, ref int posRef, ref int currentStateIdRef, ref int endPosRef)
+ where TAcceleratedStateHandler : struct, IAcceleratedStateHandler
+ where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler
+ {
+ // Initial check for input end lifted out of the subsequent hot-path loop.
+ if (posRef == input.Length)
+ {
+ if (_stateArray[currentStateIdRef]!.IsNullableFor(_positionKinds[0]))
+ {
+ // The end position kind was nullable.
+ endPosRef = posRef;
+ }
+ return true;
+ }
+
+ // To avoid frequent reads/writes to ref values, make and operate on local copies, which we then copy back once before returning.
+ int pos = posRef;
+ int currStateId = currentStateIdRef;
+ int endPos = endPosRef;
+
+ byte[] mtlookup = _mintermClassifier.ByteLookup!;
+ int deadStateId = _deadStateId;
+ int initialStateId = _initialStateId;
+ try
+ {
+ // The goal is to make this loop as fast as it can possibly be;
+ // every single piece of overhead should be removed here.
+ while (true)
+ {
+ if (currStateId == deadStateId)
+ {
+ return true;
+ }
+
+ if (TAcceleratedStateHandler.TryFindNextStartingPosition(this, input, mtlookup, ref currStateId, ref pos, initialStateId))
+ {
+ if (pos == input.Length)
+ {
+ // Patterns such as ^$ can be nullable right away.
+ if (_stateArray[currStateId]!.IsNullableFor(_positionKinds[0]))
+ {
+ // The end position kind was nullable.
+ endPos = pos;
+ }
+
+ currStateId = deadStateId;
+ continue;
+ }
+ }
+
+ // If the state is nullable for the next character, we found a potential end state.
+ if (TOptimizedNullabilityHandler.IsNullable(this, _nullabilityArray, currStateId, mtlookup, input, pos))
+ {
+ endPos = pos;
+
+ // A match is known to exist. If that's all we need to know, we're done.
+ if (mode == RegexRunnerMode.ExistenceRequired)
+ {
+ return true;
+ }
+ }
+
+ // If there is more input available try to transition with the next character.
+ // Note: the order here is important: the transition is attempted before the end-of-range check (short-circuiting ||), so the transition itself gets taken even for the last character of the range.
+ if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, GetMintermId(mtlookup, input, pos), timeoutOccursAt) ||
+ pos >= lengthMinus1)
+ {
+ if (pos + 1 < input.Length)
+ {
+ return false;
+ }
+ pos++;
+
+ // One-off check for the final position;
+ // this is just to move it out of the hot loop.
+ if (!(_stateFlagsArray[currStateId].IsNullable() ||
+ _stateArray[currStateId]!.IsNullableFor(GetPositionKind(-1))))
+ {
+ return true;
+
+ }
+
+ // The end position (-1) was nullable.
+ endPos = pos;
+ return true;
+ }
+
+ // We successfully transitioned, so update our current input index to match.
+ pos++;
+ }
+ }
+ finally
+ {
+ // Write back the local copies of the ref values.
+ posRef = pos;
+ endPosRef = endPos;
+ currentStateIdRef = currStateId;
+ }
+ }
+
///
- /// Workhorse inner loop for . Consumes the character by character,
+ /// Workhorse inner loop for . Consumes the character by character,
/// starting at , for each character transitioning from one state in the DFA or NFA graph to the next state,
/// lazily building out the graph as needed.
///
@@ -500,8 +748,8 @@ private int FindEndPosition
- private bool FindEndPositionDeltas(ReadOnlySpan input, int length, RegexRunnerMode mode,
- ref int posRef, ref CurrentState state, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
+ private bool FindEndPositionDeltasDFA(ReadOnlySpan input, int length, RegexRunnerMode mode,
+ long timeoutOccursAt, ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
where TStateHandler : struct, IStateHandler
where TInputReader : struct, IInputReader
where TFindOptimizationsHandler : struct, IInitialStateHandler
@@ -510,30 +758,108 @@ private bool FindEndPositionDeltas(this, input, ref state, ref pos))
{
return true;
}
-
initialStatePosCandidate = pos;
}
- // If the state is a dead end, such that we can't transition anywhere else, end the search.
- if (flags.IsDeadend())
+ int positionId = TInputReader.GetPositionId(this, input, pos);
+
+ // If the state is nullable for the next character, meaning it accepts the empty string,
+ // we found a potential end state.
+ if (TNullabilityHandler.IsNullableAt(this, in state, positionId, TStateHandler.GetStateFlags(this, in state)))
+ {
+ endPos = pos;
+
+ // endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos);
+ initialStatePos = initialStatePosCandidate;
+
+ // A match is known to exist. If that's all we need to know, we're done.
+ if (mode == RegexRunnerMode.ExistenceRequired)
+ {
+ return true;
+ }
+ }
+
+ // If there is more input available try to transition with the next character.
+ if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state, positionId, timeoutOccursAt))
+ {
+ return false;
+ }
+
+ // We successfully transitioned, so update our current input index to match.
+ pos++;
+ }
+ }
+ finally
+ {
+ // Write back the local copies of the ref values.
+ posRef = pos;
+ endPosRef = endPos;
+ initialStatePosRef = initialStatePos;
+ initialStatePosCandidateRef = initialStatePosCandidate;
+ }
+ }
+
+ ///
+ /// Workhorse inner loop for . Consumes the character by character,
+ /// starting at , for each character transitioning from one state in the DFA or NFA graph to the next state,
+ /// lazily building out the graph as needed.
+ ///
+ ///
+ /// The supplies the actual transitioning logic, controlling whether processing is
+ /// performed in DFA mode or in NFA mode. However, it expects to be configured to match,
+ /// so for example if is a , it expects the 's
+ /// to be non-negative and its to be null; vice versa for
+ /// .
+ ///
+ ///
+ /// A positive value if iteration completed because it reached a deadend state or nullable state and the call is an isMatch.
+ /// 0 if iteration completed because we reached an initial state.
+ /// A negative value if iteration completed because we ran out of input or we failed to transition.
+ ///
+ private bool FindEndPositionDeltasNFA(
+ ReadOnlySpan input, int length, RegexRunnerMode mode, long timeoutOccursAt,
+ ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
+ where TStateHandler : struct, IStateHandler
+ where TInputReader : struct, IInputReader
+ where TFindOptimizationsHandler : struct, IInitialStateHandler
+ where TNullabilityHandler : struct, INullabilityHandler
+ {
+ // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning.
+ int pos = posRef;
+ int endPos = endPosRef;
+ int initialStatePos = initialStatePosRef;
+ int initialStatePosCandidate = initialStatePosCandidateRef;
+ try
+ {
+ // Loop through each character in the input, transitioning from state to state for each.
+ while (true)
+ {
+ StateFlags flags = TStateHandler.GetStateFlags(this, in state);
+
+ // Dead end here means the set is empty
+ if (state.NfaState!.NfaStateSet.Count == 0)
{
return true;
}
@@ -545,7 +871,6 @@ private bool FindEndPositionDeltas(this, in state, positionId, flags))
{
endPos = pos;
- endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos);
initialStatePos = initialStatePosCandidate;
// A match is known to exist. If that's all we need to know, we're done.
@@ -556,7 +881,7 @@ private bool FindEndPositionDeltas= length || !TStateHandler.TryTakeTransition(this, ref state, positionId))
+ if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state, positionId, timeoutOccursAt))
{
return false;
}
@@ -570,7 +895,6 @@ private bool FindEndPositionDeltas.
///
+ /// State to start reversal from
+ /// Either valid match start location or -1
/// The input text.
/// The ending position to walk backwards from. points one past the last character of the match.
/// The initial starting location discovered in phase 1, a point we must not walk earlier than.
/// Per thread data reused between calls.
/// The found starting position for the match.
- private int FindStartPosition(ReadOnlySpan input, int i, int matchStartBoundary, PerThreadData perThreadData)
+ private int FindStartPosition(CurrentState startState, int initialLastStart, ReadOnlySpan input, int i, int matchStartBoundary, PerThreadData perThreadData)
where TInputReader : struct, IInputReader
where TNullabilityHandler : struct, INullabilityHandler
{
Debug.Assert(i >= 0, $"{nameof(i)} == {i}");
Debug.Assert(matchStartBoundary >= 0 && matchStartBoundary <= input.Length, $"{nameof(matchStartBoundary)} == {matchStartBoundary}");
Debug.Assert(i >= matchStartBoundary, $"Expected {i} >= {matchStartBoundary}.");
-
- // Get the starting state for the reverse pattern. This depends on previous character (which, because we're
- // going backwards, is character number i).
- var currentState = new CurrentState(_reverseInitialStates[GetCharKind(input, i)]);
-
- int lastStart = -1; // invalid sentinel value
+ CurrentState currentState = startState;
+ int lastStart = initialLastStart;
// Walk backwards to the furthest accepting state of the reverse pattern but no earlier than matchStartBoundary.
while (true)
{
// Run the DFA or NFA traversal backwards from the current point using the current state.
bool done = currentState.NfaState is not null ?
- FindStartPositionDeltas(input, ref i, matchStartBoundary, ref currentState, ref lastStart) :
- FindStartPositionDeltas(input, ref i, matchStartBoundary, ref currentState, ref lastStart);
+ FindStartPositionDeltasNFA(input, ref i, matchStartBoundary, ref currentState, ref lastStart) :
+ FindStartPositionDeltasDFA(input, ref i, matchStartBoundary, ref currentState, ref lastStart);
// If we found the starting position, we're done.
if (done)
@@ -635,7 +957,8 @@ private int FindStartPosition(ReadOnlySpan, for each character transitioning from one state in the DFA or NFA graph to the next state,
/// lazily building out the graph as needed.
///
- private bool FindStartPositionDeltas(ReadOnlySpan input, ref int i, int startThreshold, ref CurrentState state, ref int lastStart)
+ private bool FindStartPositionDeltasDFA(
+ ReadOnlySpan input, ref int i, int startThreshold, ref CurrentState state, ref int lastStart)
where TStateHandler : struct, IStateHandler
where TInputReader : struct, IInputReader
where TNullabilityHandler : struct, INullabilityHandler
@@ -647,27 +970,73 @@ private bool FindStartPositionDeltas 0 &&
+ TNullabilityHandler.IsNullableAt(this, in state, positionId, TStateHandler.GetStateFlags(this, in state)))
+ {
+ lastStart = pos;
+ }
+ // If we are past the start threshold or if the state is a dead end, bail; we should have already
+ // found a valid starting location.
+ if (pos <= startThreshold || state.DfaStateId == _deadStateId)
+ {
+ Debug.Assert(lastStart != -1);
+ return true;
+ }
+
+ // Try to transition with the next character, the one before the current position.
+ if (!TStateHandler.TryTakeTransition(this, ref state, positionId, 0))
+ {
+ // Return false to indicate the search didn't finish.
+ return false;
+ }
+
+ // Since we successfully transitioned, update our current index to match the fact that we consumed the previous character in the input.
+ pos--;
+ }
+ }
+ finally
+ {
+ // Write back the local copies of the ref values.
+ i = pos;
+ }
+ }
+
+ private bool FindStartPositionDeltasNFA(ReadOnlySpan input, ref int i, int startThreshold, ref CurrentState state, ref int lastStart)
+ where TStateHandler : struct, IStateHandler
+ where TInputReader : struct, IInputReader
+ where TNullabilityHandler : struct, INullabilityHandler
+ {
+ // To avoid frequent reads/writes to ref values, make and operate on local copies, which we then copy back once before returning.
+ int pos = i;
+ try
+ {
+ // Loop backwards through each character in the input, transitioning from state to state for each.
+ while (true)
+ {
int positionId = TInputReader.GetPositionId(this, input, pos - 1);
// If the state accepts the empty string, we found a valid starting position. Record it and keep going,
// since we're looking for the earliest one to occur within bounds.
- if (TNullabilityHandler.IsNullableAt(this, in state, positionId, flags))
+ if (TNullabilityHandler.IsNullableAt(this, in state, positionId, TStateHandler.GetStateFlags(this, in state)))
{
lastStart = pos;
}
// If we are past the start threshold or if the state is a dead end, bail; we should have already
// found a valid starting location.
- if (pos <= startThreshold || flags.IsDeadend())
+ if (pos <= startThreshold || state.DfaStateId == _deadStateId)
{
Debug.Assert(lastStart != -1);
return true;
}
// Try to transition with the next character, the one before the current position.
- if (!TStateHandler.TryTakeTransition(this, ref state, positionId))
+ if (!TStateHandler.TryTakeTransition(this, ref state, positionId, 0))
{
// Return false to indicate the search didn't finish.
return false;
@@ -746,7 +1115,7 @@ private Registers FindSubcaptures(ReadOnlySpan input, int i,
int coreStateId = GetCoreStateId(targetStateId);
StateFlags flags = _stateFlagsArray[coreStateId];
- Debug.Assert(!flags.IsDeadend());
+ Debug.Assert(coreStateId != _deadStateId);
if (flags.IsNullable() || (flags.CanBeNullable() && GetState(coreStateId).IsNullableFor(GetCharKind(input, i + 1))))
{
@@ -768,7 +1137,7 @@ private Registers FindSubcaptures(ReadOnlySpan input, int i,
}
Debug.Assert(current.Count > 0);
- foreach (var (endStateId, endRegisters) in current.Values)
+ foreach ((int endStateId, Registers endRegisters) in current.Values)
{
MatchingState endState = GetState(GetCoreStateId(endStateId));
if (endState.IsNullableFor(GetCharKind(input, iEnd)))
@@ -784,6 +1153,16 @@ private Registers FindSubcaptures(ReadOnlySpan input, int i,
return default;
}
+ /// <summary>Look up the minterm ID for the character at the specified position in the input; characters outside the lookup table map to minterm 0.</summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static int GetMintermId(byte[] mintermLookup, ReadOnlySpan input, int pos)
+ {
+ Debug.Assert(pos >= 0 && pos < input.Length);
+
+ char c = input[pos];
+ return c < (uint)mintermLookup.Length ? mintermLookup[c] : 0;
+ }
+
/// Stores additional data for tracking capture start and end positions.
/// The NFA simulation based third phase has one of these for each current state in the current set of live states.
internal struct Registers(int[] captureStarts, int[] captureEnds)
@@ -938,7 +1317,7 @@ private interface IStateHandler
public static abstract bool IsNullableFor(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind);
public static abstract int ExtractNullableCoreStateId(SymbolicRegexMatcher matcher, in CurrentState state, ReadOnlySpan input, int pos);
public static abstract int FixedLength(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind);
- public static abstract bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId);
+ public static abstract bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId, long timeoutOccursAt);
public static abstract StateFlags GetStateFlags(SymbolicRegexMatcher matcher, in CurrentState state);
}
@@ -949,7 +1328,8 @@ private interface IStateHandler
public static bool StartsWithLineAnchor(SymbolicRegexMatcher matcher, in CurrentState state) => matcher.GetState(state.DfaStateId).StartsWithLineAnchor;
[MethodImpl(MethodImplOptions.AggressiveInlining)]
- public static bool IsNullableFor(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind) => matcher.GetState(state.DfaStateId).IsNullableFor(nextCharKind);
+ public static bool IsNullableFor(SymbolicRegexMatcher matcher, in CurrentState state,
+ uint nextCharKind) => matcher._nullabilityArray[state.DfaStateId] > 0 && ((byte)(1 << (int)nextCharKind) & matcher._nullabilityArray[state.DfaStateId]) > 0;
/// Gets the preferred DFA state for nullability. In DFA mode this is just the state itself.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -961,7 +1341,8 @@ private interface IStateHandler
/// Take the transition to the next DFA state.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
- public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId)
+ public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId,
+ long timeoutOccursAt)
{
Debug.Assert(state.DfaStateId > 0, $"Expected non-zero {nameof(state.DfaStateId)}.");
Debug.Assert(state.NfaState is null, $"Expected null {nameof(state.NfaState)}.");
@@ -990,6 +1371,38 @@ public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref Cur
return false;
}
+ /// Transition function that only considers DFA state id
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal static bool TryTakeDFATransition(SymbolicRegexMatcher matcher, ref int state,
+ int mintermId, long timeoutOccursAt)
+ {
+ Debug.Assert(state > 0, $"Expected {nameof(state)} {state} > 0");
+
+ // Use the mintermId for the character being read to look up which state to transition to.
+ // If that state has already been materialized, move to it, and we're done. If that state
+ // hasn't been materialized, try to create it; if we can, move to it, and we're done.
+ int nextStateId = matcher._dfaDelta[matcher.DeltaOffset(state, mintermId)];
+ if (nextStateId > 0)
+ {
+ // There was an existing DFA transition to some state. Move to it and
+ // return that we're still operating as a DFA and can keep going.
+ state = nextStateId;
+ return true;
+ }
+
+ if (matcher.TryCreateNewTransition(matcher.GetState(state), mintermId,
+ matcher.DeltaOffset(state, mintermId),
+ checkThreshold: true, out MatchingState? nextState, timeoutOccursAt))
+ {
+ // We were able to create a new DFA transition to some state. Move to it and
+ // return that we're still operating as a DFA and can keep going.
+ state = nextState.Id;
+ return true;
+ }
+
+ return false;
+ }
+
///
/// Gets context independent state information:
/// - whether this is an initial state
@@ -998,8 +1411,8 @@ public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref Cur
/// - whether this state may be contextually nullable
///
[MethodImpl(MethodImplOptions.AggressiveInlining)]
- public static StateFlags GetStateFlags(SymbolicRegexMatcher matcher, in CurrentState state)
- => matcher._stateFlagsArray[state.DfaStateId];
+ public static StateFlags GetStateFlags(SymbolicRegexMatcher matcher, in CurrentState state) =>
+ matcher._stateFlagsArray[state.DfaStateId];
}
/// An for operating over instances configured as NFA states.
@@ -1067,7 +1480,8 @@ public static int FixedLength(SymbolicRegexMatcher matcher, in CurrentStat
}
/// Take the transition to the next NFA state.
- public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId)
+ public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId,
+ long timeoutOccursAt = 0)
{
Debug.Assert(state.DfaStateId < 0, $"Expected negative {nameof(state.DfaStateId)}.");
Debug.Assert(state.NfaState is not null, $"Expected non-null {nameof(state.NfaState)}.");
@@ -1149,25 +1563,17 @@ static int[] GetNextStates(int sourceState, int mintermId, SymbolicRegexMatcher<
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static StateFlags GetStateFlags(SymbolicRegexMatcher matcher, in CurrentState state)
{
- SparseIntMap stateSet = state.NfaState!.NfaStateSet;
- if (stateSet.Count == 0)
- {
- // In NFA state sets dead ends are never included. Instead an empty set of states represents a dead end.
- return StateFlags.IsDeadendFlag;
- }
- else
+ // Build the flags for the set of states by taking a bitwise Or of all the per-state flags and then
+ // masking out the irrelevant ones. This works because IsNullable and CanBeNullable should be true if
+ // they are true for any state in the set; SimulatesBacktracking is true for all the states if
+ // it is true for any state (since it is a phase-wide property); and all other flags are masked out.
+ StateFlags flags = 0;
+ foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values))
{
- // Build the flags for the set of states by taking a bitwise Or of all the per-state flags and then
- // masking out the irrelevant ones. This works because IsNullable and CanBeNullable should be true if
- // they are true for any state in the set; SimulatesBacktracking is true for all the states if
- // it is true for any state (since it is a phase-wide property); and all other flags are masked out.
- StateFlags flags = 0;
- foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(stateSet.Values))
- {
- flags |= matcher._stateFlagsArray[matcher.GetCoreStateId(nfaState.Key)];
- }
- return flags & (StateFlags.IsNullableFlag | StateFlags.CanBeNullableFlag | StateFlags.SimulatesBacktrackingFlag);
+ flags |= matcher._stateFlagsArray[matcher.GetCoreStateId(nfaState.Key)];
}
+
+ return flags & (StateFlags.IsNullableFlag | StateFlags.CanBeNullableFlag | StateFlags.SimulatesBacktrackingFlag);
}
#if DEBUG
@@ -1207,7 +1613,7 @@ private interface IInputReader
private readonly struct NoZAnchorInputReader : IInputReader
{
public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan input, int pos) =>
- (uint)pos >= (uint)input.Length ? -1 : matcher._mintermClassifier.GetMintermID(input[pos]);
+ (uint)pos < (uint)input.Length ? matcher._mintermClassifier.GetMintermID(input[pos]) : -1;
}
/// This reader includes full handling of an \n as the last character of input for the \Z anchor.
@@ -1215,41 +1621,103 @@ public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan
{
public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan input, int pos)
{
- if ((uint)pos >= (uint)input.Length)
- return -1;
-
- int c = input[pos];
+ if ((uint)pos < (uint)input.Length)
+ {
+ // Find the minterm, handling the special case for the last \n for states that start with a relevant anchor
+ int c = input[pos];
+ return c == '\n' && pos == input.Length - 1 ?
+ matcher._minterms.Length : // mintermId = minterms.Length represents an \n at the very end of input
+ matcher._mintermClassifier.GetMintermID(c);
+ }
- // Find the minterm, handling the special case for the last \n for states that start with a relevant anchor
- return c == '\n' && pos == input.Length - 1 ?
- matcher._minterms.Length : // mintermId = minterms.Length represents an \n at the very end of input
- matcher._mintermClassifier.GetMintermID(c);
+ return -1;
}
}
- ///
- /// Interface for optimizations to accelerate search from initial states.
- ///
private interface IInitialStateHandler
{
- public static abstract bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos)
+ public static abstract bool TryFindNextStartingPosition(
+ SymbolicRegexMatcher matcher, ReadOnlySpan input,
+ ref CurrentState state, ref int pos)
where TInputReader : struct, IInputReader;
}
///
- /// No-op handler for when there are no initial state optimizations to apply.
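+ /// Interface for accelerated states. TryFindNextStartingPosition returns true if the accelerated search was applied (the position and state may then have been updated), false otherwise.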
///
- private readonly struct NoOptimizationsInitialStateHandler : IInitialStateHandler
+ private interface IAcceleratedStateHandler
+ {
+ public static abstract bool TryFindNextStartingPosition(
+ SymbolicRegexMatcher matcher, ReadOnlySpan input,
+ byte[] lookup, ref int currentStateId, ref int pos, int initialStateId);
+ }
+
+ private readonly struct NoAnchorAcceleratedStateHandler : IAcceleratedStateHandler
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
- public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos)
- where TInputReader : struct, IInputReader
+ public static bool TryFindNextStartingPosition(
+ SymbolicRegexMatcher matcher, ReadOnlySpan input, byte[] lookup, ref int currentStateId, ref int pos, int initialStateId)
{
- // return true to indicate that the current position is a possible starting position
+ if (currentStateId != initialStateId)
+ {
+ return false;
+ }
+
+ if (!matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
+ {
+ // No match exists
+ currentStateId = matcher._deadStateId;
+ pos = input.Length;
+ }
+
return true;
}
}
+ private readonly struct AcceleratedStateHandler : IAcceleratedStateHandler
+ {
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher,
+ ReadOnlySpan input,
+ byte[] lookup, ref int currentStateId, ref int pos, int initialStateId)
+ {
+ if (currentStateId != initialStateId)
+ {
+ return false;
+ }
+
+ if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
+ {
+ currentStateId = matcher._dotstarredInitialStates[matcher._positionKinds[GetMintermId(lookup, input, pos - 1) + 1]].Id;
+ }
+ else
+ {
+ // No match exists
+ currentStateId = matcher._deadStateId;
+ pos = input.Length;
+ }
+
+ return true;
+ }
+ }
+
+ private readonly struct NoAcceleratedStateHandler : IAcceleratedStateHandler
+ {
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static bool TryFindNextStartingPosition(
+ SymbolicRegexMatcher matcher, ReadOnlySpan input, byte[] lookup, ref int currentStateId, ref int pos, int initialStateId) =>
+ false;
+ }
+
+ /// No-op handler for when there are no initial state optimizations to apply.
+ private readonly struct NoOptimizationsInitialStateHandler : IInitialStateHandler
+ {
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos)
+ where TInputReader : struct, IInputReader =>
+ true; // the current position is a possible starting position
+ }
+
///
/// Handler for when a instance is available.
///
@@ -1260,26 +1728,33 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatche
where TInputReader : struct, IInputReader
{
// Find the first position that matches with some likely character.
- if (!matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
+ if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
{
- // No match exists
- return false;
+ // Update the starting state based on where TryFindNextStartingPosition moved us to.
+ // As with the initial starting state, if it's a dead end, no match exists.
+ state = new CurrentState(matcher._dotstarredInitialStates[matcher.GetCharKind(input, pos - 1)]);
+ return true;
}
- // Update the starting state based on where TryFindNextStartingPosition moved us to.
- // As with the initial starting state, if it's a dead end, no match exists.
- state = new CurrentState(matcher._dotstarredInitialStates[matcher.GetCharKind(input, pos - 1)]);
- return true;
+ // No match exists
+ return false;
}
}
- ///
- /// Interface for evaluating nullability of states.
- ///
+ /// Interface for evaluating nullability of states.
private interface INullabilityHandler
{
- public static abstract bool IsNullableAt(SymbolicRegexMatcher matcher, in CurrentState state, int positionId, StateFlags flags)
- where TStateHandler : struct, IStateHandler;
+ public static abstract bool IsNullableAt(
+ SymbolicRegexMatcher matcher, in CurrentState state, int positionId, StateFlags flags)
+ where TStateHandler : struct, IStateHandler;
+ }
+
+ /// This nullability handler interface can be used in DFAs for patterns that do not contain \Z.
+ private interface IOptimizedNullabilityHandler
+ {
+ public static abstract bool IsNullable(
+ SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId,
+ byte[] lookup, ReadOnlySpan input, int pos);
}
///
@@ -1303,9 +1778,37 @@ public static bool IsNullableAt(SymbolicRegexMatcher matche
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsNullableAt(SymbolicRegexMatcher matcher, in CurrentState state, int positionId, StateFlags flags)
- where TStateHandler : struct, IStateHandler
+ where TStateHandler : struct, IStateHandler =>
+ flags.IsNullable() ||
+ (flags.CanBeNullable() && TStateHandler.IsNullableFor(matcher, in state, matcher.GetPositionKind(positionId)));
+ }
+
+ private readonly struct NoAnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler
+ {
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan input, int pos)
{
- return flags.IsNullable() || (flags.CanBeNullable() && TStateHandler.IsNullableFor(matcher, in state, matcher.GetPositionKind(positionId)));
+ Debug.Assert(pos >= 0 && pos < input.Length, "input end should not be handled here");
+ Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date");
+ return nullabilityArray[currStateId] > 0;
+ }
+ }
+
+ private readonly struct AnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler
+ {
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan input, int pos)
+ {
+ Debug.Assert(pos >= 0 && pos < input.Length, "input end should not be handled here");
+ Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date");
+
+ if (nullabilityArray[currStateId] > 0)
+ {
+ char c = input[pos];
+ return matcher.IsNullableWithContext(currStateId, c < (uint)lookup.Length ? lookup[c] : 0);
+ }
+
+ return false;
}
}
}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs
index a138c819be00fa..5384810092b7fc 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs
@@ -185,7 +185,7 @@ internal bool CanBeNullable
public List> ToList(List>? list = null, SymbolicRegexNodeKind listKind = SymbolicRegexNodeKind.Concat)
{
Debug.Assert(listKind is SymbolicRegexNodeKind.Concat or SymbolicRegexNodeKind.Alternate);
- list ??= new List>();
+ list ??= [];
AppendToList(this, list, listKind);
return list;
@@ -394,9 +394,11 @@ SymbolicRegexNodeKind.BoundaryAnchor or SymbolicRegexNodeKind.NonBoundaryAnchor
SymbolicRegexNodeKind.BeginningAnchor or SymbolicRegexNodeKind.EndAnchor or
SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or
SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor);
- return Create(builder, kind, null, null, -1, -1, default, SymbolicRegexInfo.Anchor(isLineAnchor: kind is
- SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or
- SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor));
+ return Create(
+ builder, kind, null, null, -1, -1, default,
+ SymbolicRegexInfo.Anchor(
+ isLineAnchor: kind is SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor,
+ isEndZAnchor: kind is SymbolicRegexNodeKind.EndAnchorZ));
}
#endregion
@@ -540,8 +542,8 @@ internal static SymbolicRegexNode CreateAlternate(SymbolicRegexBuilder> seenElems = new();
// Keep track of if any elements from the right side need to be eliminated
+ HashSet> seenElems = [];
bool rightChanged = false;
for (int i = 0; i < elems.Count; i++)
{
@@ -835,7 +837,7 @@ private static bool TryFoldAlternation(SymbolicRegexBuilder builder, Symbo
static bool TrySplitConcatSubsumption(SymbolicRegexBuilder builder, SymbolicRegexNode left, SymbolicRegexNode right,
[NotNullWhen(true)] out SymbolicRegexNode? prefix)
{
- List> prefixElements = new();
+ List> prefixElements = [];
SymbolicRegexNode suffix = right;
while (suffix._kind == SymbolicRegexNodeKind.Concat)
{
@@ -1051,7 +1053,7 @@ public SymbolicRegexNode AddFixedLengthMarkers(SymbolicRegexBuilder
/// the derivative
internal List<(SymbolicRegexNode, DerivativeEffect[])> CreateNfaDerivativeWithEffects(SymbolicRegexBuilder builder, TSet elem, uint context)
{
- List<(SymbolicRegexNode, DerivativeEffect[])> transitions = new();
+ List<(SymbolicRegexNode, DerivativeEffect[])> transitions = [];
CreateDerivativeWrapper(builder, elem, context).StripAndMapEffects(builder, context, transitions);
return transitions;
}
@@ -1084,9 +1086,8 @@ private SymbolicRegexNode PruneLowerPriorityThanNullability(SymbolicRegexB
return this;
// Cache result to avoid otherwise potential quadratic worst case behavior
- SymbolicRegexNode? prunedNode;
(SymbolicRegexNode, uint) key = (this, context);
- if (builder._pruneLowerPriorityThanNullabilityCache.TryGetValue(key, out prunedNode))
+ if (builder._pruneLowerPriorityThanNullabilityCache.TryGetValue(key, out SymbolicRegexNode? prunedNode))
{
return prunedNode;
}
@@ -1253,9 +1254,8 @@ private SymbolicRegexNode CreateDerivative(SymbolicRegexBuilder buil
return StackHelper.CallOnEmptyStack(CreateDerivative, builder, elem, context);
}
- SymbolicRegexNode? derivative;
(SymbolicRegexNode, TSet, uint) key = (this, elem, context);
- if (builder._derivativeCache.TryGetValue(key, out derivative))
+ if (builder._derivativeCache.TryGetValue(key, out SymbolicRegexNode? derivative))
{
return derivative;
}
@@ -1433,7 +1433,7 @@ internal void StripAndMapEffects(SymbolicRegexBuilder builder, uint contex
return;
}
- currentEffects ??= new List();
+ currentEffects ??= [];
// If we've reached a node with no effects, then output that with the effects that have been accumulated
if (!_info.ContainsEffect)
@@ -1468,7 +1468,7 @@ internal void StripAndMapEffects(SymbolicRegexBuilder builder, uint contex
_left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects);
for (int i = oldAlternativesCount; i < alternativesAndEffects.Count; i++)
{
- var (node, effects) = alternativesAndEffects[i];
+ (SymbolicRegexNode node, DerivativeEffect[] effects) = alternativesAndEffects[i];
alternativesAndEffects[i] = (builder.CreateConcat(node, _right), effects);
}
break;
@@ -1506,7 +1506,7 @@ internal void StripAndMapEffects(SymbolicRegexBuilder builder, uint contex
_left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects);
for (int i = oldAlternativesCount; i < alternativesAndEffects.Count; i++)
{
- var (node, effects) = alternativesAndEffects[i];
+ (SymbolicRegexNode node, DerivativeEffect[] effects) = alternativesAndEffects[i];
alternativesAndEffects[i] = (builder.CreateDisableBacktrackingSimulation(node), effects);
}
break;
@@ -1895,12 +1895,8 @@ private void CollectSets(SymbolicRegexBuilder builder, HashSet sets)
}
/// Compute and sort all the minterms from the sets in this regex.
- public TSet[] ComputeMinterms(SymbolicRegexBuilder builder)
- {
- HashSet sets = GetSets(builder);
- List minterms = MintermGenerator.GenerateMinterms(builder._solver, sets);
- return minterms.ToArray();
- }
+ public TSet[] ComputeMinterms(SymbolicRegexBuilder builder) =>
+ MintermGenerator.GenerateMinterms(builder._solver, GetSets(builder)).ToArray();
///
/// Create the reverse of this regex
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs
index fea9518b79b512..aa6708a60d01a9 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs
@@ -40,8 +40,8 @@ public SymbolicRegexRunnerFactory(RegexTree regexTree, RegexOptions options, Tim
BDD[] minterms = rootNode.ComputeMinterms(bddBuilder);
_matcher = minterms.Length > 64 ?
- SymbolicRegexMatcher.Create(regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new BitVectorSolver(minterms, charSetSolver), matchTimeout) :
- SymbolicRegexMatcher.Create(regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new UInt64Solver(minterms, charSetSolver), matchTimeout);
+ SymbolicRegexMatcher.Create(regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new BitVectorSolver(minterms), matchTimeout) :
+ SymbolicRegexMatcher.Create(regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new UInt64Solver(minterms), matchTimeout);
}
/// Creates a object.
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
index 6057827e1d53fd..5d73a3e232e809 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs
@@ -8,33 +8,33 @@ namespace System.Text.RegularExpressions.Symbolic
///
internal static class SymbolicRegexThresholds
{
- /// Maximum number of built states before switching over to NFA mode.
+ /// Maximum number of <see cref="SymbolicRegexNode{TSet}"/> instances before switching over to NFA mode.
///
/// By default, all matching starts out using DFAs, where every state transitions to one and only one
/// state for any minterm (each character maps to one minterm). Some regular expressions, however, can result
/// in really, really large DFA state graphs, much too big to actually store. Instead of failing when we
/// encounter such state graphs, at some point we instead switch from processing as a DFA to processing as
- /// an NFA. As an NFA, we instead track all of the states we're in at any given point, and transitioning
- /// from one "state" to the next really means for every constituent state that composes our current "state",
- /// we find all possible states that transitioning out of each of them could result in, and the union of
- /// all of those is our new "state". This constant represents the size of the graph after which we start
- /// processing as an NFA instead of as a DFA. This processing doesn't change immediately, however. All
- /// processing starts out in DFA mode, even if we've previously triggered NFA mode for the same regex.
- /// We switch over into NFA mode the first time a given traversal (match operation) results in us needing
- /// to create a new node and the graph is already or newly beyond this threshold.
+ /// an NFA. As an NFA, we instead track all of the states we're in at any given point.
///
- internal const int NfaThreshold = 10_000;
+ ///
+ /// This limit is chosen due to memory usage constraints; the largest possible memory allocation for a regex instance is currently ~50 MB.
+ /// Worst case memory consumption for the regex instance can be approximated to ~(NfaNodeCountThreshold * (sizeof(MatchingState) + sizeof(SymbolicRegexNode))),
+ /// while in most cases the MatchingState part can be ignored, as only a subset of nodes have their own state.
+ ///
+ internal const int NfaNodeCountThreshold = 125_000;
///
/// Default maximum estimated safe expansion size of a <see cref="SymbolicRegexNode{TSet}"/> AST
- /// after the AST has been anlayzed for safe handling.
+ /// after the AST has been analyzed for safe handling.
///
/// If the AST exceeds this threshold then <see cref="NotSupportedException"/> is thrown.
/// This default value may be overridden with the AppContext data
/// whose name is given by <see cref="SymbolicRegexSafeSizeThreshold_ConfigKeyName"/>.
///
+ /// This limit is chosen due to worst case NFA speed constraints,
+ /// although it could be safely raised higher at the expense of worst-case NFA performance.
///
- internal const int DefaultSymbolicRegexSafeSizeThreshold = 1000;
+ internal const int DefaultSymbolicRegexSafeSizeThreshold = 10_000;
///The environment variable name for a value overriding the default value
internal const string SymbolicRegexSafeSizeThreshold_ConfigKeyName = "REGEX_NONBACKTRACKING_MAX_AUTOMATA_SIZE";
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/UInt64Solver.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/UInt64Solver.cs
index 7664d6d03aa4a7..c65c00fd23413a 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/UInt64Solver.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/UInt64Solver.cs
@@ -12,12 +12,12 @@ internal sealed class UInt64Solver : ISolver
private readonly BDD[] _minterms;
internal readonly MintermClassifier _classifier;
- public UInt64Solver(BDD[] minterms, CharSetSolver solver)
+ public UInt64Solver(BDD[] minterms)
{
Debug.Assert(minterms.Length <= 64);
_minterms = minterms;
- _classifier = new MintermClassifier(minterms, solver);
+ _classifier = new MintermClassifier(minterms);
Full = minterms.Length == 64 ? ulong.MaxValue : ulong.MaxValue >> (64 - minterms.Length);
}
diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Ctor.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Ctor.Tests.cs
index cefad992523428..b9659996a4e517 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Ctor.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Ctor.Tests.cs
@@ -133,7 +133,7 @@ public static void Ctor_Invalid()
Assert.Throws(() => new Regex(@"(?>a*)a", RegexHelpers.RegexOptionNonBacktracking)); // NonBacktracking and atomics
Assert.Throws(() => new Regex(@"\Ga", RegexHelpers.RegexOptionNonBacktracking)); // NonBacktracking and start anchors
Assert.Throws(() => new Regex(@"(?A)(?<-C>B)$", RegexHelpers.RegexOptionNonBacktracking)); // NonBacktracking and balancing groups
- Assert.Throws(() => new Regex(@"\w{1,1001}", RegexHelpers.RegexOptionNonBacktracking)); // Potentially large automata expansion
+ Assert.Throws(() => new Regex(@"\w{1,100001}", RegexHelpers.RegexOptionNonBacktracking)); // Potentially large automata expansion
}
}
diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
index 57780531253d35..1f0e2932c6425d 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
@@ -1402,7 +1402,7 @@ public void NonBacktracking_NoEndAnchorMatchAtTimeoutCheck()
{
// This constant must be at least as large as the one in the implementation that sets the maximum number
// of innermost loop iterations between timeout checks.
- const int CharsToTriggerTimeoutCheck = 10000;
+ const int CharsToTriggerTimeoutCheck = 200_000;
// Check that it is indeed large enough to trigger timeouts. If this fails the constant above needs to be larger.
Assert.Throws(() => new Regex("a*", RegexHelpers.RegexOptionNonBacktracking, TimeSpan.FromTicks(1))
.Match(new string('a', CharsToTriggerTimeoutCheck)));
@@ -2653,5 +2653,25 @@ public static IEnumerable