diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index 2042b930fdd2c8..86353b31b5d7b7 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -72,6 +72,8 @@ + + diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs index 49205f5ee2649f..d2aec2621a81c8 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs @@ -5,7 +5,6 @@ using System.Collections.Generic; using System.Diagnostics; using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; #pragma warning disable CS8500 // takes address of managed type diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/BitVectorSolver.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/BitVectorSolver.cs index 09db2948d717be..b30527871e2bb4 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/BitVectorSolver.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/BitVectorSolver.cs @@ -10,11 +10,11 @@ internal sealed class BitVectorSolver : ISolver internal readonly MintermClassifier _classifier; private readonly BitVector[] _mintermVectors; - public BitVectorSolver(BDD[] minterms, CharSetSolver solver) + public BitVectorSolver(BDD[] minterms) { _minterms = minterms; - _classifier = new MintermClassifier(minterms, solver); + _classifier = 
new MintermClassifier(minterms); var singleBitVectors = new BitVector[minterms.Length]; for (int i = 0; i < singleBitVectors.Length; i++) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs new file mode 100644 index 00000000000000..2ea1ea8af7422c --- /dev/null +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs @@ -0,0 +1,39 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Diagnostics; + +namespace System.Text.RegularExpressions.Symbolic +{ + /// Provides details on how a match may be processed in reverse to find the beginning of a match once a match's existence has been confirmed. + internal readonly struct MatchReversalInfo where TSet : IComparable, IEquatable + { + /// Initializes the match reversal details. + internal MatchReversalInfo(MatchReversalKind kind, int fixedLength, MatchingState? adjustedStartState = null) + { + Debug.Assert(kind is MatchReversalKind.MatchStart or MatchReversalKind.FixedLength or MatchReversalKind.PartialFixedLength); + Debug.Assert(fixedLength >= 0); + Debug.Assert((adjustedStartState is not null) == (kind is MatchReversalKind.PartialFixedLength)); + + Kind = kind; + FixedLength = fixedLength; + AdjustedStartState = adjustedStartState; + } + + /// Gets the kind of the match reversal processing required. + internal MatchReversalKind Kind { get; } + + /// Gets the fixed length of the match, if one is known. + /// + /// For MatchReversalKind.MatchStart, this is ignored. + /// For MatchReversalKind.FixedLength, this is the full length of the match. The beginning may be found simply + /// by subtracting this length from the end. + /// For MatchReversalKind.PartialFixedLength, this is the length of the fixed portion of the match. 
+ /// + internal int FixedLength { get; } + + /// Gets the adjusted start state to use for partial fixed-length matches. + /// This will be non-null iff Kind is MatchReversalKind.PartialFixedLength. + internal MatchingState? AdjustedStartState { get; } + } +} diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs new file mode 100644 index 00000000000000..a949e6204a16a3 --- /dev/null +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs @@ -0,0 +1,26 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +namespace System.Text.RegularExpressions.Symbolic +{ + /// Specifies the kind of a MatchReversalInfo. + internal enum MatchReversalKind + { + /// The regex should be run in reverse to find the beginning of the match. + MatchStart, + + /// The end of the pattern is of a fixed length and can be skipped as part of running a regex in reverse to find the beginning of the match. + /// + /// Reverse execution is not necessary for a subset of the match. + /// MatchReversalInfo.FixedLength will contain the length of the fixed portion. + /// + PartialFixedLength, + + /// The entire pattern is of a fixed length. + /// + /// Reverse execution is not necessary to find the beginning of the match. + /// MatchReversalInfo.FixedLength will contain the length of the match. 
+ /// + FixedLength + } +} diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs index dce65a9996330d..3aacc4a61cbb94 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs @@ -14,6 +14,7 @@ internal MatchingState(SymbolicRegexNode node, uint prevCharKind) { Node = node; PrevCharKind = prevCharKind; + NullabilityInfo = BuildNullabilityInfo(); } /// The regular expression that labels this state and gives it its semantics. @@ -95,21 +96,37 @@ internal SymbolicRegexNode Next(SymbolicRegexBuilder builder, TSet m return Node.CreateNfaDerivativeWithEffects(builder, minterm, context); } + /// Determines whether the node is nullable for the given context. + /// + /// This is functionally equivalent to SymbolicRegexNode.IsNullableFor, but using cached + /// answers stored in NullabilityInfo. + /// [MethodImpl(MethodImplOptions.AggressiveInlining)] internal bool IsNullableFor(uint nextCharKind) { - Debug.Assert(CharKind.IsValidCharKind(nextCharKind)); - uint context = CharKind.Context(PrevCharKind, nextCharKind); - return Node.IsNullableFor(context); + Debug.Assert(nextCharKind is >= 0 and < CharKind.CharKindCount); + return (NullabilityInfo & (1 << (int)nextCharKind)) != 0; } + /// Gets the nullability info for the matching state. + /// + /// + /// 00000 -> node cannot be nullable + /// 00001 -> nullable for General + /// 00010 -> nullable for BeginningEnd + /// 00100 -> nullable for NewLine + /// 01000 -> nullable for NewLineS + /// 10000 -> nullable for WordLetter + /// + /// + internal int NullabilityInfo { get; } + /// /// Builds a with the relevant flags set. 
/// - /// a solver for /// whether this state is an initial state /// the flags for this matching state - internal StateFlags BuildStateFlags(ISolver solver, bool isInitial) + internal StateFlags BuildStateFlags(bool isInitial) { StateFlags info = 0; @@ -118,11 +135,6 @@ internal StateFlags BuildStateFlags(ISolver solver, bool isInitial) info |= StateFlags.IsInitialFlag; } - if (IsDeadend(solver)) - { - info |= StateFlags.IsDeadendFlag; - } - if (Node.CanBeNullable) { info |= StateFlags.CanBeNullableFlag; @@ -140,6 +152,22 @@ internal StateFlags BuildStateFlags(ISolver solver, bool isInitial) return info; } + /// Builds the nullability information for the matching state. + /// Nullability for each context is encoded in a bit. See . + private byte BuildNullabilityInfo() + { + byte nullabilityInfo = 0; + if (Node.CanBeNullable) + { + for (uint charKind = 0; charKind < CharKind.CharKindCount; charKind++) + { + nullabilityInfo |= (byte)(Node.IsNullableFor(CharKind.Context(PrevCharKind, charKind)) ? 1 << (int)charKind : 0); + } + } + + return nullabilityInfo; + } + public override bool Equals(object? obj) => obj is MatchingState s && PrevCharKind == s.PrevCharKind && Node.Equals(s.Node); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs index d00fcc0d62ff40..24d2a26f849229 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs @@ -1,7 +1,9 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
+using System.Buffers; using System.Diagnostics; +using System.Numerics; using System.Runtime.CompilerServices; namespace System.Text.RegularExpressions.Symbolic @@ -20,81 +22,104 @@ namespace System.Text.RegularExpressions.Symbolic /// internal sealed class MintermClassifier { - /// An array used when there's a single minterm, in order to map every ASCII character to it trivially. - private static readonly int[] AllAsciiIsZeroMintermArray = new int[128]; + /// Mapping for characters to minterms, used in the vast majority case when there are less than 256 minterms. + /// _lookup[char] provides the minterm ID. If char >= _lookup.Length, its minterm is 0. + private readonly byte[]? _lookup; - /// Array providing fast mapping from an ASCII character (the array index) to its corresponding minterm ID. - private readonly int[] _ascii; - /// A multi-terminal BDD for mapping any non-ASCII character to its associated minterm ID. - /// - /// The use of a multi-terminal BDD here is an implementation detail. Should we decide its important to optimize non-ASCII inputs further, - /// or to consolidate the mechanism with the other engines, an alternatie lookup algorithm / data structure could be employed. - /// - private readonly BDD _nonAscii; + /// Mapping for characters to minterms, used when there are at least 256 minterms. This is rarely used. + /// _intLookup[char] provides the minterm ID. If char >= _intLookup.Length, its minterm is 0. + private readonly int[]? _intLookup; /// Create a classifier that maps a character to the ID of its associated minterm. /// A BDD for classifying all characters (ASCII and non-ASCII) to their corresponding minterm IDs. - /// The character set solver to use. - public MintermClassifier(BDD[] minterms, CharSetSolver solver) + public MintermClassifier(BDD[] minterms) { Debug.Assert(minterms.Length > 0, "Requires at least"); if (minterms.Length == 1) { // With only a single minterm, the mapping is trivial: everything maps to it (ID 0). 
- // For ASCII, use an array containing all zeros. For non-ASCII, use a BDD that maps everything to 0. - _ascii = AllAsciiIsZeroMintermArray; - _nonAscii = solver.ReplaceTrue(BDD.True, 0); + _lookup = []; return; } - // Create a multi-terminal BDD for mapping any character to its associated minterm. - BDD anyCharacterToMintermId = BDD.False; - for (int i = 0; i < minterms.Length; i++) - { - // Each supplied minterm BDD decides whether a given character maps to it or not. - // We need to combine all of those into a multi-terminal BDD that decides which - // minterm a character maps to. To do that, we take each minterm BDD and replace - // its True result with the ID of the minterm, such that a character that would - // have returned True for that BDD now returns the minterm ID. - BDD charToTargetMintermId = solver.ReplaceTrue(minterms[i], i); + // Compute all minterm ranges. We do this here in order to determine the maximum character value + // in order to size the lookup array to minimize steady-state memory consumption of the potentially + // large lookup array. We prefer to use the byte[] _lookup when possible, in order to keep memory + // consumption to a minimum; doing so accommodates up to 255 minterms, which is the vast majority case. + // However, when there are more than 255 minterms, we need to use int[] _intLookup. + (uint, uint)[][] charRangesPerMinterm = ArrayPool<(uint, uint)[]>.Shared.Rent(minterms.Length); - // Now union this BDD with the multi-terminal BDD we've built up thus far. Unioning - // is valid because every character belongs to exactly one minterm and thus will - // only map to an ID instead of False in exactly one of the input BDDs. 
- anyCharacterToMintermId = solver.Or(anyCharacterToMintermId, charToTargetMintermId); + int maxChar = -1; + for (int mintermId = 1; mintermId < minterms.Length; mintermId++) + { + (uint, uint)[] ranges = BDDRangeConverter.ToRanges(minterms[mintermId]); + charRangesPerMinterm[mintermId] = ranges; + maxChar = Math.Max(maxChar, (int)ranges[^1].Item2); } - // Now that we have our mapping that supports any input character, we want to optimize for - // ASCII inputs. Rather than forcing every input ASCII character to consult the BDD at match - // time, we precompute a lookup table, where each ASCII character can be used to index into the - // array to determine the ID for its corresponding minterm. - var ascii = new int[128]; - for (int i = 0; i < ascii.Length; i++) + // It's incredibly rare for a regex to use more than a couple hundred minterms, + // but we need a fallback just in case. (Over 128 unique sets also means it's never ASCII only.) + if (minterms.Length > 255) + { + _intLookup = CreateLookup(minterms, charRangesPerMinterm, maxChar); + } + else { - ascii[i] = anyCharacterToMintermId.Find(i); + _lookup = CreateLookup(minterms, charRangesPerMinterm, maxChar); } - _ascii = ascii; - // We can also further optimize the BDD in two ways: - // 1. We can now remove the ASCII characters from it, as we'll always consult the lookup table first - // for ASCII inputs and thus will never use the BDD for them. While optional (skipping this step will not - // affect correctness), removing the ASCII values from the BDD reduces the size of the multi-terminal BDD. - // 2. We can check if every character now maps to the same minterm ID (the same terminal in the - // multi-terminal BDD). This can be relatively common after (1) above is applied, as many - // patterns don't distinguish between any non-ASCII characters (e.g. "[0-9]*"). If every character - // in the BDD now maps to the same minterm, we can replace the BDD with a much simpler/faster/smaller one. 
- BDD nonAsciiBDD = solver.And(anyCharacterToMintermId, solver.NonAscii); - nonAsciiBDD = nonAsciiBDD.IsEssentiallyBoolean(out BDD? singleTerminalBDD) ? singleTerminalBDD : nonAsciiBDD; - _nonAscii = nonAsciiBDD; + // Return the rented array. We clear it before returning it in order to avoid all the ranges arrays being kept alive. + Array.Clear(charRangesPerMinterm, 0, minterms.Length); + ArrayPool<(uint, uint)[]>.Shared.Return(charRangesPerMinterm); + + // Creates the lookup array. + static T[] CreateLookup(BDD[] minterms, ReadOnlySpan<(uint, uint)[]> charRangesPerMinterm, int _maxChar) where T : IBinaryInteger + { + T[] lookup = new T[_maxChar + 1]; + for (int mintermId = 1; mintermId < minterms.Length; mintermId++) + { + // Each minterm maps to a range of characters. Set each of the characters in those ranges to the corresponding minterm. + foreach ((uint start, uint end) in charRangesPerMinterm[mintermId]) + { + lookup.AsSpan((int)start, (int)(end + 1 - start)).Fill(T.CreateTruncating(mintermId)); + } + } + + return lookup; + } } - /// Gets the ID of the minterm associated with the specified character. + /// Gets the ID of the minterm associated with the specified character. [MethodImpl(MethodImplOptions.AggressiveInlining)] public int GetMintermID(int c) { - int[] ascii = _ascii; - return (uint)c < (uint)ascii.Length ? ascii[c] : _nonAscii.Find(c); + if (_lookup is not null) + { + byte[] lookup = _lookup; + return (uint)c < (uint)lookup.Length ? lookup[c] : 0; + } + else + { + int[] lookup = _intLookup!; + return (uint)c < (uint)lookup.Length ? lookup[c] : 0; + } } + /// + /// Gets a quick mapping from char to minterm for the common case when there are <= 255 minterms. + /// Null if there are more than 255 minterms. + /// + public byte[]? ByteLookup => _lookup; + + /// + /// Gets a mapping from char to minterm for the rare case when there are more than 255 minterms. + /// Null in the common case where there are 255 or fewer minterms. + /// + public int[]? 
IntLookup => _intLookup; + + /// + /// Maximum ordinal character for a non-0 minterm, used to conserve memory + /// + public int MaxChar => (_lookup?.Length ?? _intLookup!.Length) - 1; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs index 5a620f3771be6f..b446fecdca28f5 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StateFlags.cs @@ -6,18 +6,18 @@ namespace System.Text.RegularExpressions.Symbolic /// /// These flags provide context-independent information available for every state. They provide a fast way to evaluate /// conditions in the inner matching loops of . The matcher caches one of these - /// for every state, for which they are created by . + /// for every state, for which they are created by . /// In DFA mode the cached flags are used directly, while in NFA mode the /// handles aggregating the flags in the state set. 
/// [Flags] internal enum StateFlags : byte { + None = 0, IsInitialFlag = 1, - IsDeadendFlag = 2, - IsNullableFlag = 4, - CanBeNullableFlag = 8, - SimulatesBacktrackingFlag = 16, + IsNullableFlag = 2, + CanBeNullableFlag = 4, + SimulatesBacktrackingFlag = 8, } /// @@ -25,10 +25,9 @@ internal enum StateFlags : byte /// internal static class StateFlagsExtensions { - internal static bool IsInitial(this StateFlags info) => (info & StateFlags.IsInitialFlag) != 0; - internal static bool IsDeadend(this StateFlags info) => (info & StateFlags.IsDeadendFlag) != 0; - internal static bool IsNullable(this StateFlags info) => (info & StateFlags.IsNullableFlag) != 0; - internal static bool CanBeNullable(this StateFlags info) => (info & StateFlags.CanBeNullableFlag) != 0; - internal static bool SimulatesBacktracking(this StateFlags info) => (info & StateFlags.SimulatesBacktrackingFlag) != 0; + internal static bool IsInitial(this StateFlags info) => (info & StateFlags.IsInitialFlag) != StateFlags.None; + internal static bool IsNullable(this StateFlags info) => (info & StateFlags.IsNullableFlag) != StateFlags.None; + internal static bool CanBeNullable(this StateFlags info) => (info & StateFlags.CanBeNullableFlag) != StateFlags.None; + internal static bool SimulatesBacktracking(this StateFlags info) => (info & StateFlags.SimulatesBacktrackingFlag) != StateFlags.None; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs index ff95195292bfa4..b0aa0cd6e938de 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs @@ -17,6 +17,7 @@ namespace System.Text.RegularExpressions.Symbolic private const uint IsHighPriorityNullableMask 
= 64; private const uint ContainsEffectMask = 128; private const uint ContainsLineAnchorMask = 256; + private const uint ContainsEndZAnchorMask = 512; private readonly uint _info; @@ -26,7 +27,7 @@ private static SymbolicRegexInfo Create( bool isAlwaysNullable = false, bool canBeNullable = false, bool startsWithLineAnchor = false, bool containsLineAnchor = false, bool startsWithSomeAnchor = false, bool containsSomeAnchor = false, - bool isHighPriorityNullable = false, bool containsEffect = false) + bool isHighPriorityNullable = false, bool containsEffect = false, bool containsEndZAnchor = false) { // Assert that the expected implications hold. For example, every node that contains a line anchor // must also be marked as containing some anchor. @@ -43,7 +44,8 @@ private static SymbolicRegexInfo Create( (startsWithSomeAnchor ? StartsWithSomeAnchorMask : 0) | (containsSomeAnchor ? ContainsSomeAnchorMask : 0) | (isHighPriorityNullable ? IsHighPriorityNullableMask : 0) | - (containsEffect ? ContainsEffectMask : 0)); + (containsEffect ? ContainsEffectMask : 0) | + (containsEndZAnchor ? ContainsEndZAnchorMask : 0)); } public bool IsNullable => (_info & IsAlwaysNullableMask) != 0; @@ -63,6 +65,7 @@ private static SymbolicRegexInfo Create( public bool IsHighPriorityNullable => (_info & IsHighPriorityNullableMask) != 0; public bool ContainsEffect => (_info & ContainsEffectMask) != 0; + public bool ContainsEndZAnchor => (_info & ContainsEndZAnchorMask) != 0; /// /// Used for any node that acts as an epsilon, i.e., something that always matches the empty string. @@ -77,13 +80,15 @@ public static SymbolicRegexInfo Epsilon() => /// Used for all anchors. 
/// /// whether this anchor is a line anchor - public static SymbolicRegexInfo Anchor(bool isLineAnchor) => + /// whether this anchor is an end Z anchor + public static SymbolicRegexInfo Anchor(bool isLineAnchor, bool isEndZAnchor) => Create( canBeNullable: true, startsWithLineAnchor: isLineAnchor, containsLineAnchor: isLineAnchor, startsWithSomeAnchor: true, - containsSomeAnchor: true); + containsSomeAnchor: true, + containsEndZAnchor: isEndZAnchor); /// /// The alternation remains high priority nullable if the left alternative is so. @@ -99,7 +104,8 @@ public static SymbolicRegexInfo Alternate(SymbolicRegexInfo left_info, SymbolicR startsWithSomeAnchor: left_info.StartsWithSomeAnchor || right_info.StartsWithSomeAnchor, containsSomeAnchor: left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor, isHighPriorityNullable: left_info.IsHighPriorityNullable, - containsEffect: left_info.ContainsEffect || right_info.ContainsEffect); + containsEffect: left_info.ContainsEffect || right_info.ContainsEffect, + containsEndZAnchor: left_info.ContainsEndZAnchor || right_info.ContainsEndZAnchor); /// /// Concatenation remains high priority nullable if both left and right are so. @@ -115,7 +121,9 @@ public static SymbolicRegexInfo Concat(SymbolicRegexInfo left_info, SymbolicRege startsWithSomeAnchor: left_info.StartsWithSomeAnchor || (left_info.CanBeNullable && right_info.StartsWithSomeAnchor), containsSomeAnchor: left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor, isHighPriorityNullable: left_info.IsHighPriorityNullable && right_info.IsHighPriorityNullable, - containsEffect: left_info.ContainsEffect || right_info.ContainsEffect); + containsEffect: left_info.ContainsEffect || right_info.ContainsEffect, + containsEndZAnchor: left_info.ContainsEndZAnchor || right_info.ContainsEndZAnchor + ); /// /// Inherits anchor visibility from the loop body. 
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs index 278b69fe391fef..327f5666f9e2a5 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs @@ -4,6 +4,7 @@ using System.Collections.Generic; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; +using System.Runtime.CompilerServices; using System.Threading; namespace System.Text.RegularExpressions.Symbolic @@ -25,7 +26,7 @@ internal sealed partial class SymbolicRegexMatcher /// Cache for the states that have been created. Each state is uniquely identified by its associated /// and the kind of the previous character. /// - private readonly Dictionary<(SymbolicRegexNode Node, uint PrevCharKind), MatchingState> _stateCache = new(); + private readonly Dictionary<(SymbolicRegexNode Node, uint PrevCharKind), MatchingState> _stateCache = []; /// /// Maps state ids to states, initial capacity is given by . @@ -40,6 +41,14 @@ internal sealed partial class SymbolicRegexMatcher /// private StateFlags[] _stateFlagsArray; + /// Cached nullability info for each state ID. + /// + /// _nullabilityArray[stateId] == the for that state. + /// Used to short-circuit nullability in the hot loop. + /// Important: the pattern must not contain endZ for this to be valid. + /// + private byte[] _nullabilityArray; + /// /// The transition function for DFA mode. /// Each state has a range of consecutive entries for each minterm ID. A range of size 2^L, where L is @@ -69,7 +78,7 @@ internal sealed partial class SymbolicRegexMatcher /// It is the inverse of used entries in _nfaStateArray. 
/// The range of this map is 0 to its size - 1. /// - private readonly Dictionary _nfaIdByCoreId = new(); + private readonly Dictionary _nfaIdByCoreId = []; /// /// Transition function for NFA transitions in NFA mode. @@ -107,6 +116,13 @@ private static void ArrayResizeAndVolatilePublish(ref T[] array, int newSize) private int DeltaOffset(int stateId, int mintermId) => (stateId << _mintermsLog) | mintermId; + /// + /// Pre-computed hot-loop version of nullability check + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool IsNullableWithContext(int stateId, int mintermId) => + (_nullabilityArray[stateId] & (1 << (int)GetPositionKind(mintermId))) > 0; + /// Returns the span from that may contain transitions for the given state private Span GetDeltasFor(MatchingState state) { @@ -152,6 +168,78 @@ private MatchingState GetOrCreateState(SymbolicRegexNode node, uint return GetOrCreateState_NoLock(node, prevCharKind); } + /// + /// Analyze the specified reversed pattern to gather details that help to optimize the reverse matching process + /// for when finding the beginning of a match. + /// + /// + /// Optimized reversal state computation during construction which skips the fixed length suffix, e.g. for the pattern abc.*def + /// 1) the end is found at abc.*def| + /// 2) the reversal starts at abc.*| + /// + /// Reversed initial pattern + /// The match reversal details. + private MatchReversalInfo CreateOptimizedReversal(SymbolicRegexNode node) + { + int pos = 0; + while (true) + { + if (node._info.ContainsSomeAnchor) + { + // Bail if it contains any anchors as it invalidates the optimization. + // (This could potentially be a very good future optimization for anchors but there's too many edge cases to guarantee it works. 
+ // One example which fails currently: pattern: @"\By\b", input: "xy") + pos = 0; + break; + } + + if (node._kind is not SymbolicRegexNodeKind.Concat) + { + if (node._kind is SymbolicRegexNodeKind.CaptureStart) + { + node = _builder.Epsilon; // The entire match is fixed length. + } + break; + } + + SymbolicRegexNode? left = node._left; + Debug.Assert(left is not null); + + if (left._kind is SymbolicRegexNodeKind.CaptureEnd or SymbolicRegexNodeKind.BoundaryAnchor or SymbolicRegexNodeKind.Singleton) + { + node = node._right!; + if (left._kind is SymbolicRegexNodeKind.Singleton) + { + pos++; + } + } + else if (left._kind is SymbolicRegexNodeKind.Loop) + { + if (left._lower <= 0 || left._left!.Kind is not SymbolicRegexNodeKind.Singleton) + { + break; + } + + node = left._lower == left._upper ? + node._right! : // The entire loop is fixed + _builder.CreateConcat( // Subtract the fixed part of the loop. + _builder.CreateLoop(left._left, left.IsLazy, 0, left._upper - left._lower), + node._right!); + pos += left._lower; + } + else + { + break; + } + } + + Debug.Assert(pos >= 0); + return + pos == 0 ? new MatchReversalInfo(MatchReversalKind.MatchStart, 0) : + node == _builder.Epsilon ? new MatchReversalInfo(MatchReversalKind.FixedLength, pos) : + new MatchReversalInfo(MatchReversalKind.PartialFixedLength, pos, GetOrCreateState_NoLock(_builder.CreateDisableBacktrackingSimulation(node), 0)); + } + /// /// Create a state with given node and previous character context. 
/// @@ -178,9 +266,11 @@ private MatchingState GetOrCreateState_NoLock(SymbolicRegexNode node ArrayResizeAndVolatilePublish(ref _stateArray, newsize); ArrayResizeAndVolatilePublish(ref _dfaDelta, newsize << _mintermsLog); ArrayResizeAndVolatilePublish(ref _stateFlagsArray, newsize); + ArrayResizeAndVolatilePublish(ref _nullabilityArray, newsize); } _stateArray[state.Id] = state; - _stateFlagsArray[state.Id] = state.BuildStateFlags(Solver, isInitialState); + _stateFlagsArray[state.Id] = state.BuildStateFlags(isInitialState); + _nullabilityArray[state.Id] = (byte)state.NullabilityInfo; } return state; @@ -266,17 +356,18 @@ private int GetCoreStateId(int nfaStateId) /// Gets or creates a new DFA transition. /// This function locks the matcher for safe concurrent use of the private bool TryCreateNewTransition( - MatchingState sourceState, int mintermId, int offset, bool checkThreshold, [NotNullWhen(true)] out MatchingState? nextState) + MatchingState sourceState, int mintermId, int offset, bool checkThreshold, [NotNullWhen(true)] out MatchingState? nextState, + long timeoutOccursAt = 0) { Debug.Assert(offset < _dfaDelta.Length); - lock (this) { // check if meanwhile delta[offset] has become defined possibly by another thread MatchingState? targetState = _stateArray[_dfaDelta[offset]]; if (targetState is null) { - if (checkThreshold && _stateCache.Count >= SymbolicRegexThresholds.NfaThreshold) + if ((timeoutOccursAt != 0 && Environment.TickCount64 > timeoutOccursAt) || // if there's an active timer + (checkThreshold && _builder._nodeCache.Count >= SymbolicRegexThresholds.NfaNodeCountThreshold)) // if # of nodes exceeds the NFA threshold { nextState = null; return false; @@ -312,10 +403,10 @@ private int[] CreateNewNfaTransition(int nfaStateId, int mintermId, int nfaOffse MatchingState coreState = GetState(coreId); TSet minterm = GetMintermFromId(mintermId); uint nextCharKind = GetPositionKind(mintermId); - SymbolicRegexNode? targetNode = coreTargetId > 0 ? 
+ SymbolicRegexNode targetNode = coreTargetId > 0 ? GetState(coreTargetId).Node : coreState.Next(_builder, minterm, nextCharKind); - List targetsList = new(); + List targetsList = []; ForEachNfaState(targetNode, nextCharKind, targetsList, static (int nfaId, List targetsList) => targetsList.Add(nfaId)); @@ -342,8 +433,9 @@ private int[] CreateNewNfaTransition(int nfaStateId, int mintermId, int nfaOffse TSet minterm = GetMintermFromId(mintermId); uint nextCharKind = GetPositionKind(mintermId); List<(SymbolicRegexNode Node, DerivativeEffect[] Effects)>? transition = coreState.NfaNextWithEffects(_builder, minterm, nextCharKind); + // Build the new state and store it into the array. - List<(int, DerivativeEffect[])> targetsList = new(); + List<(int, DerivativeEffect[])> targetsList = []; foreach ((SymbolicRegexNode Node, DerivativeEffect[] Effects) entry in transition) { ForEachNfaState(entry.Node, nextCharKind, (targetsList, entry.Effects), diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 4394329f8eae21..08f423b03344ad 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -81,6 +81,20 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher /// Data and routines for skipping ahead to the next place a match could potentially start. private readonly RegexFindOptimizations? _findOpts; + /// + /// Dead end state to quickly return NoMatch. + /// + private readonly int _deadStateId; + + /// Initial state used for vectorization. + private readonly int _initialStateId; + + /// Whether the pattern contains any anchor. 
+ private readonly bool _containsAnyAnchor; + + /// Whether the pattern contains the EndZ anchor, which invalidates most optimization shortcuts. + private readonly bool _containsEndZAnchor; + /// The initial states for the original pattern, keyed off of the previous character kind. /// If the pattern doesn't contain any anchors, there will only be a single initial state. private readonly MatchingState[] _initialStates; @@ -93,6 +107,9 @@ internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher /// If the pattern doesn't contain any anchors, there will only be a single initial state. private readonly MatchingState[] _reverseInitialStates; + /// Details on optimized processing of the reverse of the pattern to find the beginning of a match. + private readonly MatchReversalInfo _optimizedReversalInfo; + /// Partition of the input space of sets. private readonly TSet[] _minterms; @@ -169,9 +186,10 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo ((BitVectorSolver)(object)builder._solver)._classifier; _capsize = captureCount; - // Initialization for fields in SymbolicRegexMatcher.Automata.cs + // Initialize state and nullability arrays. _stateArray = new MatchingState[InitialDfaStateCapacity]; _stateFlagsArray = new StateFlags[InitialDfaStateCapacity]; + _nullabilityArray = new byte[InitialDfaStateCapacity]; _dfaDelta = new int[InitialDfaStateCapacity << _mintermsLog]; // Initialize a lookup array for the character kinds of each minterm ID. This includes one "special" minterm @@ -183,6 +201,9 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo _positionKinds[mintermId + 1] = CalculateMintermIdKind(mintermId); } + // Gather optimized reversal processing information. + _optimizedReversalInfo = CreateOptimizedReversal(_pattern.Reverse(builder)); + // Store the find optimizations that can be used to jump ahead to the next possible starting location. 
// If there's a leading beginning anchor, the find optimizations are unnecessary on top of the DFA's // handling for beginning anchors. @@ -199,6 +220,10 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo // The loops below and how character kinds are calculated assume that the "general" character kind is zero Debug.Assert(CharKind.General == 0); + // Assign edge case info for quick lookup + _containsAnyAnchor = _pattern._info.ContainsSomeAnchor; + _containsEndZAnchor = _pattern._info.ContainsEndZAnchor; + // Create the initial states for the original pattern. var initialStates = new MatchingState[statesCount]; for (uint charKind = 0; charKind < initialStates.Length; charKind++) @@ -221,6 +246,10 @@ private SymbolicRegexMatcher(SymbolicRegexBuilder builder, SymbolicRegexNo } _dotstarredInitialStates = dotstarredInitialStates; + // Assign dead and initial state ids + _deadStateId = GetOrCreateState_NoLock(_builder._nothing, 0).Id; + _initialStateId = _dotstarredInitialStates[CharKind.General].Id; + // Create the reverse pattern (the original pattern in reverse order) and all of its // initial states. Also disable backtracking simulation to ensure the reverse path from // the final state that was found is followed. Not doing so might cause the earliest @@ -342,18 +371,27 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // As an example, consider the pattern a{1,3}(b*) run against an input of aacaaaabbbc: phase 1 will find // the position of the last b: aacaaaabbbc. It additionally records the position of the first a after // the c as the low boundary for the starting position. - int matchStartLowBoundary, matchStartLengthMarker; - int matchEnd = (_pattern._info.ContainsLineAnchor, _findOpts is not null, _pattern._info.ContainsSomeAnchor) switch + + // The Z anchor and over 255 minterms are rare enough to consider them separate edge cases. 
+ int matchEnd; + if (!_containsEndZAnchor && _mintermClassifier.IntLookup is null) { - (true, true, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), - (true, true, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), - (true, false, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), - (true, false, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), - (false, true, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), - (false, true, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), - (false, false, true) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), - (false, false, false) => FindEndPosition(input, startat, timeoutOccursAt, mode, out matchStartLowBoundary, out matchStartLengthMarker, perThreadData), - }; + // Optimize processing for the common case of no Z anchor and <= 255 minterms. Specialize each call with different generic method arguments. 
+ matchEnd = (_findOpts is not null, _containsAnyAnchor) switch + { + (true, true) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (true, false) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (false, false) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + (false, true) => FindEndPositionOptimized(input, startat, timeoutOccursAt, mode, perThreadData), + }; + } + else + { + // Fallback for Z anchor or over 255 minterms + matchEnd = _findOpts is not null ? + FindEndPositionFallback(input, startat, timeoutOccursAt, mode, perThreadData) : + FindEndPositionFallback(input, startat, timeoutOccursAt, mode, perThreadData); + } // If there wasn't a match, we're done. if (matchEnd == NoMatchExists) @@ -374,22 +412,57 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i // recorded a fixed-length marker for the portion of the pattern that matched, as we can then jump that // exact number of positions backwards. Continuing the previous example, phase 2 will walk backwards from // that last b until it finds the 4th a: aaabbbc. - int matchStart; - if (matchStartLengthMarker >= 0) - { - matchStart = matchEnd - matchStartLengthMarker; - } - else + int matchStart = 0; + Debug.Assert(matchEnd >= startat - 1); + switch (_optimizedReversalInfo.Kind) { - Debug.Assert(matchEnd >= startat - 1); - matchStart = matchEnd < startat ? - startat : (_pattern._info.ContainsLineAnchor, _pattern._info.ContainsSomeAnchor) switch + case MatchReversalKind.MatchStart: + case MatchReversalKind.PartialFixedLength: + int initialLastStart = -1; // invalid sentinel value + int i = matchEnd; + CurrentState reversalStartState; + + if (_optimizedReversalInfo.Kind is MatchReversalKind.MatchStart) + { + // No fixed-length knowledge. Start at the end of the match. 
+ reversalStartState = new CurrentState(_reverseInitialStates[GetCharKind(input, matchEnd)]); + } + else + { + // There's a fixed-length portion at the end of the match. Start just before it. + i -= _optimizedReversalInfo.FixedLength; + reversalStartState = new CurrentState(_optimizedReversalInfo.AdjustedStartState!); + + // reversal may already be nullable here in the case of anchors + if (_containsAnyAnchor && + _nullabilityArray[reversalStartState.DfaStateId] > 0 && + FullNullabilityHandler.IsNullableAt( + this, in reversalStartState, FullInputReader.GetPositionId(this, input, i), + DfaStateHandler.GetStateFlags(this, in reversalStartState))) + { + initialLastStart = i; + } + } + + matchStart = matchEnd < startat ? startat : (_containsEndZAnchor, _containsAnyAnchor) switch { - (true, true) => FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData), - (true, false) => FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData), - (false, true) => FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData), - (false, false) => FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData), + // Call FindStartPosition with generic method arguments based on the presence of anchors. This is purely an optimization; + // the (true, true) case is functionally complete whereas the (false, false) case is the most optimized. 
+ (true, true) => FindStartPosition(reversalStartState, initialLastStart, input, i, startat, perThreadData), + (true, false) => FindStartPosition(reversalStartState, initialLastStart, input, i, startat, perThreadData), + (false, true) => FindStartPosition(reversalStartState, initialLastStart, input, i, startat, perThreadData), + (false, false) => FindStartPosition(reversalStartState, initialLastStart, input, i, startat, perThreadData), }; + break; + + case MatchReversalKind.FixedLength: + // The whole match is known to be of a fixed length, so we don't need to do any processing to find its beginning, just jump there. + matchStart = matchEnd - _optimizedReversalInfo.FixedLength; + break; + + default: + Debug.Fail($"Unexpected reversal kind: {_optimizedReversalInfo.Kind}"); + break; } // Phase 3: @@ -403,30 +476,91 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan input, i } else { - Registers endRegisters = _pattern._info.ContainsLineAnchor ? + Registers endRegisters = _containsAnyAnchor ? FindSubcaptures(input, matchStart, matchEnd, perThreadData) : FindSubcaptures(input, matchStart, matchEnd, perThreadData); return new SymbolicMatch(matchStart, matchEnd - matchStart, endRegisters.CaptureStarts, endRegisters.CaptureEnds); } } + /// + /// Streamlined version of that doesn't handle /z anchors or very large sets of minterms. + /// + private int FindEndPositionOptimized( + ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData) + where TAcceleratedStateHandler : struct, IAcceleratedStateHandler + where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler + { + // Initial state candidate. (This is not used in the common DFA case and could potentially be removed in the future.)
+ int initialStatePosCandidate = pos; + var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, pos - 1)]); + int endPos = NoMatchExists; + int lengthMinus1 = input.Length - 1; + + while (true) + { + int innerLoopLength; + bool done; + if (currentState.NfaState is null) + { + const int DfaCharsPerTimeoutCheck = 100_000; + innerLoopLength = _checkTimeout && lengthMinus1 - pos > DfaCharsPerTimeoutCheck ? pos + DfaCharsPerTimeoutCheck : lengthMinus1; + done = FindEndPositionDeltasDFAOptimized( + input, innerLoopLength, mode, timeoutOccursAt, ref pos, + ref currentState.DfaStateId, ref endPos); + } + else + { + // NFA fallback check, assume \Z and full nullability for NFA since it's already extremely rare to get here and it's not worth special-casing. + const int NfaCharsPerTimeoutCheck = 1_000; + innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck ? pos + NfaCharsPerTimeoutCheck : input.Length; + done = FindEndPositionDeltasNFA( + input, innerLoopLength, mode, timeoutOccursAt, ref pos, + ref currentState, ref endPos, ref initialStatePosCandidate, ref initialStatePosCandidate); + } + + // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or + // there is no more input available, then the whole search is done. + if (done || pos >= input.Length) + { + break; + } + + // The search did not finish, so we either failed to transition (which should only happen if we were in DFA mode and + // need to switch over to NFA mode) or ran out of input in the inner loop. Check if the inner loop still had more + // input available. + if (pos < innerLoopLength) + { + // Because there was still more input available, a failure to transition in DFA mode must be the cause + // of the early exit. Upgrade to NFA mode. 
+ NfaMatchingState nfaState = perThreadData.NfaState; + nfaState.InitializeFrom(this, GetState(currentState.DfaStateId)); + currentState = new CurrentState(nfaState); + } + + // Check for a timeout before continuing. + if (_checkTimeout) + { + CheckTimeout(timeoutOccursAt); + } + } + return endPos; + } + /// Performs the initial Phase 1 match to find the end position of the match, or first final state if this is an isMatch call. /// The input text. /// The starting position in . /// The time at which timeout occurs, if timeouts are being checked. /// The mode of execution based on the regex operation being performed. - /// The last position the initial state of was visited before the end position was found. - /// Length of the match if there's a match; otherwise, -1. /// Per thread data reused between calls. /// /// A one-past-the-end index into input for the preferred match, or first final state position if isMatch is true, or NoMatchExists if no match exists. /// - private int FindEndPosition(ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, out int initialStatePos, out int matchLength, PerThreadData perThreadData) + private int FindEndPositionFallback(ReadOnlySpan input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData) where TInputReader : struct, IInputReader where TFindOptimizationsHandler : struct, IInitialStateHandler where TNullabilityHandler : struct, INullabilityHandler { - initialStatePos = pos; int initialStatePosCandidate = pos; var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, pos - 1)]); @@ -442,14 +576,24 @@ private int FindEndPosition CharsPerTimeoutCheck ? - pos + CharsPerTimeoutCheck : - input.Length; - - bool done = currentState.NfaState is not null ? 
- FindEndPositionDeltas(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate) : - FindEndPositionDeltas(input, innerLoopLength, mode, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePos, ref initialStatePosCandidate); + // The fallback function has lower limits due to worse performance from edge cases + int innerLoopLength; + bool done; + if (currentState.NfaState is null) + { + const int DfaCharsPerTimeoutCheck = 25_000; + innerLoopLength = _checkTimeout && input.Length - pos > DfaCharsPerTimeoutCheck ? pos + DfaCharsPerTimeoutCheck : input.Length; + done = FindEndPositionDeltasDFA( + input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePosCandidate); + } + else + { + // NFA fallback check, assume \Z and full nullability for NFA since it's already extremely rare to get here. + const int NfaCharsPerTimeoutCheck = 1_000; + innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck ? pos + NfaCharsPerTimeoutCheck : input.Length; + done = FindEndPositionDeltasNFA( + input, innerLoopLength, mode, timeoutOccursAt, ref pos, ref currentState, ref endPos, ref endStateId, ref initialStatePosCandidate); + } // If the inner loop indicates that the search finished (for example due to reaching a deadend state) or // there is no more input available, then the whole search is done. @@ -476,15 +620,119 @@ private int FindEndPosition 0 ? GetState(endStateId).FixedLength(GetCharKind(input, endPos)) : -1; return endPos; } + + /// + /// This version of uses a different set of interfaces, + /// which don't check for many inner loop edge cases, e.g. input end or '\n'. + /// All edge cases are handled before entering the loop. 
+ /// + private bool FindEndPositionDeltasDFAOptimized( + ReadOnlySpan input, int lengthMinus1, RegexRunnerMode mode, + long timeoutOccursAt, ref int posRef, ref int currentStateIdRef, ref int endPosRef) + where TAcceleratedStateHandler : struct, IAcceleratedStateHandler + where TOptimizedNullabilityHandler : struct, IOptimizedNullabilityHandler + { + // Initial check for input end lifted out of the subsequent hot-path loop. + if (posRef == input.Length) + { + if (_stateArray[currentStateIdRef]!.IsNullableFor(_positionKinds[0])) + { + // the end position kind was nullable + endPosRef = posRef; + } + return true; + } + + // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning. + int pos = posRef; + int currStateId = currentStateIdRef; + int endPos = endPosRef; + + byte[] mtlookup = _mintermClassifier.ByteLookup!; + int deadStateId = _deadStateId; + int initialStateId = _initialStateId; + try + { + // The goal is to make this loop as fast as it can possibly be, + // every single piece of overhead should be removed here + while (true) + { + if (currStateId == deadStateId) + { + return true; + } + + if (TAcceleratedStateHandler.TryFindNextStartingPosition(this, input, mtlookup, ref currStateId, ref pos, initialStateId)) + { + if (pos == input.Length) + { + // patterns such as ^$ can be nullable right away + if (_stateArray[currStateId]!.IsNullableFor(_positionKinds[0])) + { + // the end position kind was nullable + endPos = pos; + } + + currStateId = deadStateId; + continue; + } + } + + // If the state is nullable for the next character, we found a potential end state. + if (TOptimizedNullabilityHandler.IsNullable(this, _nullabilityArray, currStateId, mtlookup, input, pos)) + { + endPos = pos; + + // A match is known to exist. If that's all we need to know, we're done. 
+ if (mode == RegexRunnerMode.ExistenceRequired) + { + return true; + } + } + + // If there is more input available try to transition with the next character. + // Note: the order here is important so the transition itself gets taken + if (!DfaStateHandler.TryTakeDFATransition(this, ref currStateId, GetMintermId(mtlookup, input, pos), timeoutOccursAt) || + pos >= lengthMinus1) + { + if (pos + 1 < input.Length) + { + return false; + } + pos++; + + // one off check for the final position + // this is just to move it out of the hot loop + if (!(_stateFlagsArray[currStateId].IsNullable() || + _stateArray[currStateId]!.IsNullableFor(GetPositionKind(-1)))) + { + return true; + + } + + // the end position (-1) was nullable + endPos = pos; + return true; + } + + // We successfully transitioned, so update our current input index to match. + pos++; + } + } + finally + { + // Write back the local copies of the ref values. + posRef = pos; + endPosRef = endPos; + currentStateIdRef = currStateId; + } + } + /// - /// Workhorse inner loop for . Consumes the character by character, + /// Workhorse inner loop for . Consumes the character by character, /// starting at , for each character transitioning from one state in the DFA or NFA graph to the next state, /// lazily building out the graph as needed. 
/// @@ -500,8 +748,8 @@ private int FindEndPosition - private bool FindEndPositionDeltas(ReadOnlySpan input, int length, RegexRunnerMode mode, - ref int posRef, ref CurrentState state, ref int endPosRef, ref int endStateIdRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) + private bool FindEndPositionDeltasDFA(ReadOnlySpan input, int length, RegexRunnerMode mode, + long timeoutOccursAt, ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) where TStateHandler : struct, IStateHandler where TInputReader : struct, IInputReader where TFindOptimizationsHandler : struct, IInitialStateHandler @@ -510,30 +758,108 @@ private bool FindEndPositionDeltas(this, input, ref state, ref pos)) { return true; } - initialStatePosCandidate = pos; } - // If the state is a dead end, such that we can't transition anywhere else, end the search. - if (flags.IsDeadend()) + int positionId = TInputReader.GetPositionId(this, input, pos); + + // If the state is nullable for the next character, meaning it accepts the empty string, + // we found a potential end state. + if (TNullabilityHandler.IsNullableAt(this, in state, positionId, TStateHandler.GetStateFlags(this, in state))) + { + endPos = pos; + + // endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos); + initialStatePos = initialStatePosCandidate; + + // A match is known to exist. If that's all we need to know, we're done. + if (mode == RegexRunnerMode.ExistenceRequired) + { + return true; + } + } + + // If there is more input available try to transition with the next character. + if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state, positionId, timeoutOccursAt)) + { + return false; + } + + // We successfully transitioned, so update our current input index to match. + pos++; + } + } + finally + { + // Write back the local copies of the ref values. 
+ posRef = pos; + endPosRef = endPos; + initialStatePosRef = initialStatePos; + initialStatePosCandidateRef = initialStatePosCandidate; + } + } + + /// + /// Workhorse inner loop for . Consumes the character by character, + /// starting at , for each character transitioning from one state in the DFA or NFA graph to the next state, + /// lazily building out the graph as needed. + /// + /// + /// The supplies the actual transitioning logic, controlling whether processing is + /// performed in DFA mode or in NFA mode. However, it expects to be configured to match, + /// so for example if is a , it expects the 's + /// to be non-negative and its to be null; vice versa for + /// . + /// + /// + /// A positive value if iteration completed because it reached a deadend state or nullable state and the call is an isMatch. + /// 0 if iteration completed because we reached an initial state. + /// A negative value if iteration completed because we ran out of input or we failed to transition. + /// + private bool FindEndPositionDeltasNFA( + ReadOnlySpan input, int length, RegexRunnerMode mode, long timeoutOccursAt, + ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef) + where TStateHandler : struct, IStateHandler + where TInputReader : struct, IInputReader + where TFindOptimizationsHandler : struct, IInitialStateHandler + where TNullabilityHandler : struct, INullabilityHandler + { + // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning. + int pos = posRef; + int endPos = endPosRef; + int initialStatePos = initialStatePosRef; + int initialStatePosCandidate = initialStatePosCandidateRef; + try + { + // Loop through each character in the input, transitioning from state to state for each. 
+ while (true) + { + StateFlags flags = TStateHandler.GetStateFlags(this, in state); + + // Dead end here means the set is empty + if (state.NfaState!.NfaStateSet.Count == 0) { return true; } @@ -545,7 +871,6 @@ private bool FindEndPositionDeltas(this, in state, positionId, flags)) { endPos = pos; - endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos); initialStatePos = initialStatePosCandidate; // A match is known to exist. If that's all we need to know, we're done. @@ -556,7 +881,7 @@ private bool FindEndPositionDeltas= length || !TStateHandler.TryTakeTransition(this, ref state, positionId)) + if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state, positionId, timeoutOccursAt)) { return false; } @@ -570,7 +895,6 @@ private bool FindEndPositionDeltas. /// + /// State to start reversal from + /// Either valid match start location or -1 /// The input text. /// The ending position to walk backwards from. points one past the last character of the match. /// The initial starting location discovered in phase 1, a point we must not walk earlier than. /// Per thread data reused between calls. /// The found starting position for the match. - private int FindStartPosition(ReadOnlySpan input, int i, int matchStartBoundary, PerThreadData perThreadData) + private int FindStartPosition(CurrentState startState, int initialLastStart, ReadOnlySpan input, int i, int matchStartBoundary, PerThreadData perThreadData) where TInputReader : struct, IInputReader where TNullabilityHandler : struct, INullabilityHandler { Debug.Assert(i >= 0, $"{nameof(i)} == {i}"); Debug.Assert(matchStartBoundary >= 0 && matchStartBoundary <= input.Length, $"{nameof(matchStartBoundary)} == {matchStartBoundary}"); Debug.Assert(i >= matchStartBoundary, $"Expected {i} >= {matchStartBoundary}."); - - // Get the starting state for the reverse pattern. This depends on previous character (which, because we're - // going backwards, is character number i). 
- var currentState = new CurrentState(_reverseInitialStates[GetCharKind(input, i)]); - - int lastStart = -1; // invalid sentinel value + CurrentState currentState = startState; + int lastStart = initialLastStart; // Walk backwards to the furthest accepting state of the reverse pattern but no earlier than matchStartBoundary. while (true) { // Run the DFA or NFA traversal backwards from the current point using the current state. bool done = currentState.NfaState is not null ? - FindStartPositionDeltas(input, ref i, matchStartBoundary, ref currentState, ref lastStart) : - FindStartPositionDeltas(input, ref i, matchStartBoundary, ref currentState, ref lastStart); + FindStartPositionDeltasNFA(input, ref i, matchStartBoundary, ref currentState, ref lastStart) : + FindStartPositionDeltasDFA(input, ref i, matchStartBoundary, ref currentState, ref lastStart); // If we found the starting position, we're done. if (done) @@ -635,7 +957,8 @@ private int FindStartPosition(ReadOnlySpan, for each character transitioning from one state in the DFA or NFA graph to the next state, /// lazily building out the graph as needed. /// - private bool FindStartPositionDeltas(ReadOnlySpan input, ref int i, int startThreshold, ref CurrentState state, ref int lastStart) + private bool FindStartPositionDeltasDFA( + ReadOnlySpan input, ref int i, int startThreshold, ref CurrentState state, ref int lastStart) where TStateHandler : struct, IStateHandler where TInputReader : struct, IInputReader where TNullabilityHandler : struct, INullabilityHandler @@ -647,27 +970,73 @@ private bool FindStartPositionDeltas 0 && + TNullabilityHandler.IsNullableAt(this, in state, positionId, TStateHandler.GetStateFlags(this, in state))) + { + lastStart = pos; + } + // If we are past the start threshold or if the state is a dead end, bail; we should have already + // found a valid starting location. 
+ if (pos <= startThreshold || state.DfaStateId == _deadStateId) + { + Debug.Assert(lastStart != -1); + return true; + } + + // Try to transition with the next character, the one before the current position. + if (!TStateHandler.TryTakeTransition(this, ref state, positionId, 0)) + { + // Return false to indicate the search didn't finish. + return false; + } + + // Since we successfully transitioned, update our current index to match the fact that we consumed the previous character in the input. + pos--; + } + } + finally + { + // Write back the local copies of the ref values. + i = pos; + } + } + + private bool FindStartPositionDeltasNFA(ReadOnlySpan input, ref int i, int startThreshold, ref CurrentState state, ref int lastStart) + where TStateHandler : struct, IStateHandler + where TInputReader : struct, IInputReader + where TNullabilityHandler : struct, INullabilityHandler + { + // To avoid frequent reads/writes to ref values, make and operate on local copies, which we then copy back once before returning. + int pos = i; + try + { + // Loop backwards through each character in the input, transitioning from state to state for each. + while (true) + { int positionId = TInputReader.GetPositionId(this, input, pos - 1); // If the state accepts the empty string, we found a valid starting position. Record it and keep going, // since we're looking for the earliest one to occur within bounds. - if (TNullabilityHandler.IsNullableAt(this, in state, positionId, flags)) + if (TNullabilityHandler.IsNullableAt(this, in state, positionId, TStateHandler.GetStateFlags(this, in state))) { lastStart = pos; } // If we are past the start threshold or if the state is a dead end, bail; we should have already // found a valid starting location. - if (pos <= startThreshold || flags.IsDeadend()) + if (pos <= startThreshold || state.DfaStateId == _deadStateId) { Debug.Assert(lastStart != -1); return true; } // Try to transition with the next character, the one before the current position. 
- if (!TStateHandler.TryTakeTransition(this, ref state, positionId)) + if (!TStateHandler.TryTakeTransition(this, ref state, positionId, 0)) { // Return false to indicate the search didn't finish. return false; @@ -746,7 +1115,7 @@ private Registers FindSubcaptures(ReadOnlySpan input, int i, int coreStateId = GetCoreStateId(targetStateId); StateFlags flags = _stateFlagsArray[coreStateId]; - Debug.Assert(!flags.IsDeadend()); + Debug.Assert(coreStateId != _deadStateId); if (flags.IsNullable() || (flags.CanBeNullable() && GetState(coreStateId).IsNullableFor(GetCharKind(input, i + 1)))) { @@ -768,7 +1137,7 @@ private Registers FindSubcaptures(ReadOnlySpan input, int i, } Debug.Assert(current.Count > 0); - foreach (var (endStateId, endRegisters) in current.Values) + foreach ((int endStateId, Registers endRegisters) in current.Values) { MatchingState endState = GetState(GetCoreStateId(endStateId)); if (endState.IsNullableFor(GetCharKind(input, iEnd))) @@ -784,6 +1153,16 @@ private Registers FindSubcaptures(ReadOnlySpan input, int i, return default; } + /// Look up the min term ID for the character at the specified position in the input. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int GetMintermId(byte[] mintermLookup, ReadOnlySpan input, int pos) + { + Debug.Assert(pos >= 0 && pos < input.Length); + + char c = input[pos]; + return c < (uint)mintermLookup.Length ? mintermLookup[c] : 0; + } + /// Stores additional data for tracking capture start and end positions. /// The NFA simulation based third phase has one of these for each current state in the current set of live states. 
internal struct Registers(int[] captureStarts, int[] captureEnds) @@ -938,7 +1317,7 @@ private interface IStateHandler public static abstract bool IsNullableFor(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind); public static abstract int ExtractNullableCoreStateId(SymbolicRegexMatcher matcher, in CurrentState state, ReadOnlySpan input, int pos); public static abstract int FixedLength(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind); - public static abstract bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId); + public static abstract bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId, long timeoutOccursAt); public static abstract StateFlags GetStateFlags(SymbolicRegexMatcher matcher, in CurrentState state); } @@ -949,7 +1328,8 @@ private interface IStateHandler public static bool StartsWithLineAnchor(SymbolicRegexMatcher matcher, in CurrentState state) => matcher.GetState(state.DfaStateId).StartsWithLineAnchor; [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool IsNullableFor(SymbolicRegexMatcher matcher, in CurrentState state, uint nextCharKind) => matcher.GetState(state.DfaStateId).IsNullableFor(nextCharKind); + public static bool IsNullableFor(SymbolicRegexMatcher matcher, in CurrentState state, + uint nextCharKind) => matcher._nullabilityArray[state.DfaStateId] > 0 && ((byte)(1 << (int)nextCharKind) & matcher._nullabilityArray[state.DfaStateId]) > 0; /// Gets the preferred DFA state for nullability. In DFA mode this is just the state itself. [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -961,7 +1341,8 @@ private interface IStateHandler /// Take the transition to the next DFA state. 
[MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId) + public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId, + long timeoutOccursAt) { Debug.Assert(state.DfaStateId > 0, $"Expected non-zero {nameof(state.DfaStateId)}."); Debug.Assert(state.NfaState is null, $"Expected null {nameof(state.NfaState)}."); @@ -990,6 +1371,38 @@ public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref Cur return false; } + /// Transition function that only considers DFA state id + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static bool TryTakeDFATransition(SymbolicRegexMatcher matcher, ref int state, + int mintermId, long timeoutOccursAt) + { + Debug.Assert(state > 0, $"Expected {nameof(state)} {state} > 0"); + + // Use the mintermId for the character being read to look up which state to transition to. + // If that state has already been materialized, move to it, and we're done. If that state + // hasn't been materialized, try to create it; if we can, move to it, and we're done. + int nextStateId = matcher._dfaDelta[matcher.DeltaOffset(state, mintermId)]; + if (nextStateId > 0) + { + // There was an existing DFA transition to some state. Move to it and + // return that we're still operating as a DFA and can keep going. + state = nextStateId; + return true; + } + + if (matcher.TryCreateNewTransition(matcher.GetState(state), mintermId, + matcher.DeltaOffset(state, mintermId), + checkThreshold: true, out MatchingState? nextState, timeoutOccursAt)) + { + // We were able to create a new DFA transition to some state. Move to it and + // return that we're still operating as a DFA and can keep going. 
+ state = nextState.Id; + return true; + } + + return false; + } + /// /// Gets context independent state information: /// - whether this is an initial state @@ -998,8 +1411,8 @@ public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref Cur /// - whether this state may be contextually nullable /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static StateFlags GetStateFlags(SymbolicRegexMatcher matcher, in CurrentState state) - => matcher._stateFlagsArray[state.DfaStateId]; + public static StateFlags GetStateFlags(SymbolicRegexMatcher matcher, in CurrentState state) => + matcher._stateFlagsArray[state.DfaStateId]; } /// An for operating over instances configured as NFA states. @@ -1067,7 +1480,8 @@ public static int FixedLength(SymbolicRegexMatcher matcher, in CurrentStat } /// Take the transition to the next NFA state. - public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId) + public static bool TryTakeTransition(SymbolicRegexMatcher matcher, ref CurrentState state, int mintermId, + long timeoutOccursAt = 0) { Debug.Assert(state.DfaStateId < 0, $"Expected negative {nameof(state.DfaStateId)}."); Debug.Assert(state.NfaState is not null, $"Expected non-null {nameof(state.NfaState)}."); @@ -1149,25 +1563,17 @@ static int[] GetNextStates(int sourceState, int mintermId, SymbolicRegexMatcher< [MethodImpl(MethodImplOptions.AggressiveInlining)] public static StateFlags GetStateFlags(SymbolicRegexMatcher matcher, in CurrentState state) { - SparseIntMap stateSet = state.NfaState!.NfaStateSet; - if (stateSet.Count == 0) - { - // In NFA state sets dead ends are never included. Instead an empty set of states represents a dead end. - return StateFlags.IsDeadendFlag; - } - else + // Build the flags for the set of states by taking a bitwise Or of all the per-state flags and then + // masking out the irrelevant ones. 
This works because IsNullable and CanBeNullable should be true if + // they are true for any state in the set; SimulatesBacktracking is true for all the states if + // it is true for any state (since it is a phase-wide property); and all other flags are masked out. + StateFlags flags = 0; + foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values)) { - // Build the flags for the set of states by taking a bitwise Or of all the per-state flags and then - // masking out the irrelevant ones. This works because IsNullable and CanBeNullable should be true if - // they are true for any state in the set; SimulatesBacktracking is true for all the states if - // it is true for any state (since it is a phase-wide property); and all other flags are masked out. - StateFlags flags = 0; - foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(stateSet.Values)) - { - flags |= matcher._stateFlagsArray[matcher.GetCoreStateId(nfaState.Key)]; - } - return flags & (StateFlags.IsNullableFlag | StateFlags.CanBeNullableFlag | StateFlags.SimulatesBacktrackingFlag); + flags |= matcher._stateFlagsArray[matcher.GetCoreStateId(nfaState.Key)]; } + + return flags & (StateFlags.IsNullableFlag | StateFlags.CanBeNullableFlag | StateFlags.SimulatesBacktrackingFlag); } #if DEBUG @@ -1207,7 +1613,7 @@ private interface IInputReader private readonly struct NoZAnchorInputReader : IInputReader { public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan input, int pos) => - (uint)pos >= (uint)input.Length ? -1 : matcher._mintermClassifier.GetMintermID(input[pos]); + (uint)pos < (uint)input.Length ? matcher._mintermClassifier.GetMintermID(input[pos]) : -1; } /// This reader includes full handling of an \n as the last character of input for the \Z anchor. 
@@ -1215,41 +1621,103 @@ public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan { public static int GetPositionId(SymbolicRegexMatcher matcher, ReadOnlySpan input, int pos) { - if ((uint)pos >= (uint)input.Length) - return -1; - - int c = input[pos]; + if ((uint)pos < (uint)input.Length) + { + // Find the minterm, handling the special case for the last \n for states that start with a relevant anchor + int c = input[pos]; + return c == '\n' && pos == input.Length - 1 ? + matcher._minterms.Length : // mintermId = minterms.Length represents an \n at the very end of input + matcher._mintermClassifier.GetMintermID(c); + } - // Find the minterm, handling the special case for the last \n for states that start with a relevant anchor - return c == '\n' && pos == input.Length - 1 ? - matcher._minterms.Length : // mintermId = minterms.Length represents an \n at the very end of input - matcher._mintermClassifier.GetMintermID(c); + return -1; } } - /// - /// Interface for optimizations to accelerate search from initial states. - /// private interface IInitialStateHandler { - public static abstract bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) + public static abstract bool TryFindNextStartingPosition( + SymbolicRegexMatcher matcher, ReadOnlySpan input, + ref CurrentState state, ref int pos) where TInputReader : struct, IInputReader; } /// - /// No-op handler for when there are no initial state optimizations to apply. 
+ /// Interface for accelerated states, returns true if position was changed /// - private readonly struct NoOptimizationsInitialStateHandler : IInitialStateHandler + private interface IAcceleratedStateHandler + { + public static abstract bool TryFindNextStartingPosition( + SymbolicRegexMatcher matcher, ReadOnlySpan input, + byte[] lookup, ref int currentStateId, ref int pos, int initialStateId); + } + + private readonly struct NoAnchorAcceleratedStateHandler : IAcceleratedStateHandler { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) - where TInputReader : struct, IInputReader + public static bool TryFindNextStartingPosition( + SymbolicRegexMatcher matcher, ReadOnlySpan input, byte[] lookup, ref int currentStateId, ref int pos, int initialStateId) { - // return true to indicate that the current position is a possible starting position + if (currentStateId != initialStateId) + { + return false; + } + + if (!matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) + { + // No match exists + currentStateId = matcher._deadStateId; + pos = input.Length; + } + return true; } } + private readonly struct AcceleratedStateHandler : IAcceleratedStateHandler + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, + ReadOnlySpan input, + byte[] lookup, ref int currentStateId, ref int pos, int initialStateId) + { + if (currentStateId != initialStateId) + { + return false; + } + + if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) + { + currentStateId = matcher._dotstarredInitialStates[matcher._positionKinds[GetMintermId(lookup, input, pos - 1) + 1]].Id; + } + else + { + // No match exists + currentStateId = matcher._deadStateId; + pos = input.Length; + } + + return true; + } + } + + private readonly struct 
NoAcceleratedStateHandler : IAcceleratedStateHandler + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool TryFindNextStartingPosition( + SymbolicRegexMatcher matcher, ReadOnlySpan input, byte[] lookup, ref int currentStateId, ref int pos, int initialStateId) => + false; + } + + /// No-op handler for when there are no initial state optimizations to apply. + private readonly struct NoOptimizationsInitialStateHandler : IInitialStateHandler + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool TryFindNextStartingPosition(SymbolicRegexMatcher matcher, ReadOnlySpan input, ref CurrentState state, ref int pos) + where TInputReader : struct, IInputReader => + true; // the current position is a possible starting position + } + /// /// Handler for when a instance is available. /// @@ -1260,26 +1728,33 @@ public static bool TryFindNextStartingPosition(SymbolicRegexMatche where TInputReader : struct, IInputReader { // Find the first position that matches with some likely character. - if (!matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) + if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { - // No match exists - return false; + // Update the starting state based on where TryFindNextStartingPosition moved us to. + // As with the initial starting state, if it's a dead end, no match exists. + state = new CurrentState(matcher._dotstarredInitialStates[matcher.GetCharKind(input, pos - 1)]); + return true; } - // Update the starting state based on where TryFindNextStartingPosition moved us to. - // As with the initial starting state, if it's a dead end, no match exists. - state = new CurrentState(matcher._dotstarredInitialStates[matcher.GetCharKind(input, pos - 1)]); - return true; + // No match exists + return false; } } - /// - /// Interface for evaluating nullability of states. - /// + /// Interface for evaluating nullability of states. 
private interface INullabilityHandler { - public static abstract bool IsNullableAt(SymbolicRegexMatcher matcher, in CurrentState state, int positionId, StateFlags flags) - where TStateHandler : struct, IStateHandler; + public static abstract bool IsNullableAt( + SymbolicRegexMatcher matcher, in CurrentState state, int positionId, StateFlags flags) + where TStateHandler : struct, IStateHandler; + } + + /// This nullability handler interface can be used in DFAs for patterns that do not contain \Z. + private interface IOptimizedNullabilityHandler + { + public static abstract bool IsNullable( + SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, + byte[] lookup, ReadOnlySpan input, int pos); } /// @@ -1303,9 +1778,37 @@ public static bool IsNullableAt(SymbolicRegexMatcher matche { [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool IsNullableAt(SymbolicRegexMatcher matcher, in CurrentState state, int positionId, StateFlags flags) - where TStateHandler : struct, IStateHandler + where TStateHandler : struct, IStateHandler => + flags.IsNullable() || + (flags.CanBeNullable() && TStateHandler.IsNullableFor(matcher, in state, matcher.GetPositionKind(positionId))); + } + + private readonly struct NoAnchorOptimizedNullabilityHandler : IOptimizedNullabilityHandler + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan input, int pos) { - return flags.IsNullable() || (flags.CanBeNullable() && TStateHandler.IsNullableFor(matcher, in state, matcher.GetPositionKind(positionId))); + Debug.Assert(pos >= 0 && pos < input.Length, "input end should not be handled here"); + Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date"); + return nullabilityArray[currStateId] > 0; + } + } + + private readonly struct AnchorOptimizedNullabilityHandler : 
IOptimizedNullabilityHandler + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool IsNullable(SymbolicRegexMatcher matcher, byte[] nullabilityArray, int currStateId, byte[] lookup, ReadOnlySpan input, int pos) + { + Debug.Assert(pos >= 0 && pos < input.Length, "input end should not be handled here"); + Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date"); + + if (nullabilityArray[currStateId] > 0) + { + char c = input[pos]; + return matcher.IsNullableWithContext(currStateId, c < (uint)lookup.Length ? lookup[c] : 0); + } + + return false; } } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs index a138c819be00fa..5384810092b7fc 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs @@ -185,7 +185,7 @@ internal bool CanBeNullable public List> ToList(List>? 
list = null, SymbolicRegexNodeKind listKind = SymbolicRegexNodeKind.Concat) { Debug.Assert(listKind is SymbolicRegexNodeKind.Concat or SymbolicRegexNodeKind.Alternate); - list ??= new List>(); + list ??= []; AppendToList(this, list, listKind); return list; @@ -394,9 +394,11 @@ SymbolicRegexNodeKind.BoundaryAnchor or SymbolicRegexNodeKind.NonBoundaryAnchor SymbolicRegexNodeKind.BeginningAnchor or SymbolicRegexNodeKind.EndAnchor or SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor); - return Create(builder, kind, null, null, -1, -1, default, SymbolicRegexInfo.Anchor(isLineAnchor: kind is - SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or - SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor)); + return Create( + builder, kind, null, null, -1, -1, default, + SymbolicRegexInfo.Anchor( + isLineAnchor: kind is SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor, + isEndZAnchor: kind is SymbolicRegexNodeKind.EndAnchorZ)); } #endregion @@ -540,8 +542,8 @@ internal static SymbolicRegexNode CreateAlternate(SymbolicRegexBuilder> seenElems = new(); // Keep track of if any elements from the right side need to be eliminated + HashSet> seenElems = []; bool rightChanged = false; for (int i = 0; i < elems.Count; i++) { @@ -835,7 +837,7 @@ private static bool TryFoldAlternation(SymbolicRegexBuilder builder, Symbo static bool TrySplitConcatSubsumption(SymbolicRegexBuilder builder, SymbolicRegexNode left, SymbolicRegexNode right, [NotNullWhen(true)] out SymbolicRegexNode? 
prefix) { - List> prefixElements = new(); + List> prefixElements = []; SymbolicRegexNode suffix = right; while (suffix._kind == SymbolicRegexNodeKind.Concat) { @@ -1051,7 +1053,7 @@ public SymbolicRegexNode AddFixedLengthMarkers(SymbolicRegexBuilder /// the derivative internal List<(SymbolicRegexNode, DerivativeEffect[])> CreateNfaDerivativeWithEffects(SymbolicRegexBuilder builder, TSet elem, uint context) { - List<(SymbolicRegexNode, DerivativeEffect[])> transitions = new(); + List<(SymbolicRegexNode, DerivativeEffect[])> transitions = []; CreateDerivativeWrapper(builder, elem, context).StripAndMapEffects(builder, context, transitions); return transitions; } @@ -1084,9 +1086,8 @@ private SymbolicRegexNode PruneLowerPriorityThanNullability(SymbolicRegexB return this; // Cache result to avoid otherwise potential quadratic worst case behavior - SymbolicRegexNode? prunedNode; (SymbolicRegexNode, uint) key = (this, context); - if (builder._pruneLowerPriorityThanNullabilityCache.TryGetValue(key, out prunedNode)) + if (builder._pruneLowerPriorityThanNullabilityCache.TryGetValue(key, out SymbolicRegexNode? prunedNode)) { return prunedNode; } @@ -1253,9 +1254,8 @@ private SymbolicRegexNode CreateDerivative(SymbolicRegexBuilder buil return StackHelper.CallOnEmptyStack(CreateDerivative, builder, elem, context); } - SymbolicRegexNode? derivative; (SymbolicRegexNode, TSet, uint) key = (this, elem, context); - if (builder._derivativeCache.TryGetValue(key, out derivative)) + if (builder._derivativeCache.TryGetValue(key, out SymbolicRegexNode? 
derivative)) { return derivative; } @@ -1433,7 +1433,7 @@ internal void StripAndMapEffects(SymbolicRegexBuilder builder, uint contex return; } - currentEffects ??= new List(); + currentEffects ??= []; // If we've reached a node with no effects, then output that with the effects that have been accumulated if (!_info.ContainsEffect) @@ -1468,7 +1468,7 @@ internal void StripAndMapEffects(SymbolicRegexBuilder builder, uint contex _left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects); for (int i = oldAlternativesCount; i < alternativesAndEffects.Count; i++) { - var (node, effects) = alternativesAndEffects[i]; + (SymbolicRegexNode node, DerivativeEffect[] effects) = alternativesAndEffects[i]; alternativesAndEffects[i] = (builder.CreateConcat(node, _right), effects); } break; @@ -1506,7 +1506,7 @@ internal void StripAndMapEffects(SymbolicRegexBuilder builder, uint contex _left.StripAndMapEffects(builder, context, alternativesAndEffects, currentEffects); for (int i = oldAlternativesCount; i < alternativesAndEffects.Count; i++) { - var (node, effects) = alternativesAndEffects[i]; + (SymbolicRegexNode node, DerivativeEffect[] effects) = alternativesAndEffects[i]; alternativesAndEffects[i] = (builder.CreateDisableBacktrackingSimulation(node), effects); } break; @@ -1895,12 +1895,8 @@ private void CollectSets(SymbolicRegexBuilder builder, HashSet sets) } /// Compute and sort all the minterms from the sets in this regex. 
- public TSet[] ComputeMinterms(SymbolicRegexBuilder builder) - { - HashSet sets = GetSets(builder); - List minterms = MintermGenerator.GenerateMinterms(builder._solver, sets); - return minterms.ToArray(); - } + public TSet[] ComputeMinterms(SymbolicRegexBuilder builder) => + MintermGenerator.GenerateMinterms(builder._solver, GetSets(builder)).ToArray(); /// /// Create the reverse of this regex diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs index fea9518b79b512..aa6708a60d01a9 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs @@ -40,8 +40,8 @@ public SymbolicRegexRunnerFactory(RegexTree regexTree, RegexOptions options, Tim BDD[] minterms = rootNode.ComputeMinterms(bddBuilder); _matcher = minterms.Length > 64 ? - SymbolicRegexMatcher.Create(regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new BitVectorSolver(minterms, charSetSolver), matchTimeout) : - SymbolicRegexMatcher.Create(regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new UInt64Solver(minterms, charSetSolver), matchTimeout); + SymbolicRegexMatcher.Create(regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new BitVectorSolver(minterms), matchTimeout) : + SymbolicRegexMatcher.Create(regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new UInt64Solver(minterms), matchTimeout); } /// Creates a object. 
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs index 6057827e1d53fd..5d73a3e232e809 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs @@ -8,33 +8,33 @@ namespace System.Text.RegularExpressions.Symbolic /// internal static class SymbolicRegexThresholds { - /// Maximum number of built states before switching over to NFA mode. + /// Maximum number of instances before switching over to NFA mode. /// /// By default, all matching starts out using DFAs, where every state transitions to one and only one /// state for any minterm (each character maps to one minterm). Some regular expressions, however, can result /// in really, really large DFA state graphs, much too big to actually store. Instead of failing when we /// encounter such state graphs, at some point we instead switch from processing as a DFA to processing as - /// an NFA. As an NFA, we instead track all of the states we're in at any given point, and transitioning - /// from one "state" to the next really means for every constituent state that composes our current "state", - /// we find all possible states that transitioning out of each of them could result in, and the union of - /// all of those is our new "state". This constant represents the size of the graph after which we start - /// processing as an NFA instead of as a DFA. This processing doesn't change immediately, however. All - /// processing starts out in DFA mode, even if we've previously triggered NFA mode for the same regex. 
- /// We switch over into NFA mode the first time a given traversal (match operation) results in us needing - /// to create a new node and the graph is already or newly beyond this threshold. + /// an NFA. As an NFA, we instead track all of the states we're in at any given point. /// - internal const int NfaThreshold = 10_000; + /// + /// This limit is chosen due to memory usage constraints; the largest possible memory allocation for a regex instance is currently ~50 MB. + /// Worst case memory consumption for the regex instance can be approximated to ~(NfaNodeCountThreshold * (sizeof(MatchingState) + sizeof(SymbolicRegexNode))), + /// while in most cases the MatchingState part can be ignored, as only a subset of nodes have their own state. + /// + internal const int NfaNodeCountThreshold = 125_000; /// /// Default maximum estimated safe expansion size of a AST - /// after the AST has been anlayzed for safe handling. + /// after the AST has been analyzed for safe handling. /// /// If the AST exceeds this threshold then is thrown. /// This default value may be overridden with the AppContext data /// whose name is given by . /// + /// This limit is chosen due to worst case NFA speed constraints, + /// although it could be safely raised higher at the expense of worst-case NFA performance.
/// - internal const int DefaultSymbolicRegexSafeSizeThreshold = 1000; + internal const int DefaultSymbolicRegexSafeSizeThreshold = 10_000; ///The environment variable name for a value overriding the default value internal const string SymbolicRegexSafeSizeThreshold_ConfigKeyName = "REGEX_NONBACKTRACKING_MAX_AUTOMATA_SIZE"; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/UInt64Solver.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/UInt64Solver.cs index 7664d6d03aa4a7..c65c00fd23413a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/UInt64Solver.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/UInt64Solver.cs @@ -12,12 +12,12 @@ internal sealed class UInt64Solver : ISolver private readonly BDD[] _minterms; internal readonly MintermClassifier _classifier; - public UInt64Solver(BDD[] minterms, CharSetSolver solver) + public UInt64Solver(BDD[] minterms) { Debug.Assert(minterms.Length <= 64); _minterms = minterms; - _classifier = new MintermClassifier(minterms, solver); + _classifier = new MintermClassifier(minterms); Full = minterms.Length == 64 ? 
ulong.MaxValue : ulong.MaxValue >> (64 - minterms.Length); } diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Ctor.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Ctor.Tests.cs index cefad992523428..b9659996a4e517 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Ctor.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Ctor.Tests.cs @@ -133,7 +133,7 @@ public static void Ctor_Invalid() Assert.Throws(() => new Regex(@"(?>a*)a", RegexHelpers.RegexOptionNonBacktracking)); // NonBacktracking and atomics Assert.Throws(() => new Regex(@"\Ga", RegexHelpers.RegexOptionNonBacktracking)); // NonBacktracking and start anchors Assert.Throws(() => new Regex(@"(?A)(?<-C>B)$", RegexHelpers.RegexOptionNonBacktracking)); // NonBacktracking and balancing groups - Assert.Throws(() => new Regex(@"\w{1,1001}", RegexHelpers.RegexOptionNonBacktracking)); // Potentially large automata expansion + Assert.Throws(() => new Regex(@"\w{1,100001}", RegexHelpers.RegexOptionNonBacktracking)); // Potentially large automata expansion } } diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index 57780531253d35..1f0e2932c6425d 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -1402,7 +1402,7 @@ public void NonBacktracking_NoEndAnchorMatchAtTimeoutCheck() { // This constant must be at least as large as the one in the implementation that sets the maximum number // of innermost loop iterations between timeout checks. - const int CharsToTriggerTimeoutCheck = 10000; + const int CharsToTriggerTimeoutCheck = 200_000; // Check that it is indeed large enough to trigger timeouts. 
If this fails the constant above needs to be larger. Assert.Throws(() => new Regex("a*", RegexHelpers.RegexOptionNonBacktracking, TimeSpan.FromTicks(1)) .Match(new string('a', CharsToTriggerTimeoutCheck))); @@ -2653,5 +2653,25 @@ public static IEnumerable MatchWordsInAnchoredRegexes_TestData() yield return new object[] { engine, RegexOptions.Multiline, @"\b\d{1,2}\/\d{1,2}\/\d{2,4}$", "date 10/12/1966\nand 10/12/66\nare the same", new (int, int)[] { (5, 10), (20, 8) } }; } } + + [Fact] + public async Task MatchNonBacktrackingOver255Minterms() + { + // While valid on all engines, this test in particular is designed to exercise the rare case + // of more than 255 unique minterms in the non-backtracking engine's minterm classifier. + + IEnumerable chars = Enumerable.Range(128, 400 - 128).Select(i => (char)i); + string patternString = string.Concat(chars.Select(c => $"{c}{c}?")); // adding an optional char as well just so it's not a string literal + string inputString = string.Concat(chars); + + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) + { + Regex r = await RegexHelpers.GetRegexAsync(engine, patternString); + MatchCollection ms = r.Matches(inputString); + Assert.Equal(1, ms.Count); + Assert.Equal(0, ms[0].Index); + Assert.Equal(272, ms[0].Length); + } + } } } diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs index 0e7046a04f36dd..c14e5e366e53b0 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/SymbolicRegexTests.cs @@ -209,13 +209,13 @@ public static IEnumerable UnsafeThresholdTests_MemberData() [ // simple counters that are too large "((ab){0,9000})", - "((ab){1000})", + "((ab){5000})", "((ab){100,5000})", // almost infinite lower bound "a{2147483646,}", // 2147483646 = int.MaxValue-1 // nested small
counters causing unsafe blowup through multiplicative nature of counter nesting - "(((ab){10}){10}){10}", // more than 10^3 - "((((abcd){4}){4}){4}){4}", // exponential: more than 4^5 = 1024 + "(((ab){10}){10}){50}", // more than 10^3 * 5 + "(((((abcd){4}){4}){4}){4}){10}", // exponential: more than 4^5 * 10 = 10240 // combined large counters "((ab){1000}){1000}", // more than 1000^2 "((ab){99999999}){99999999}", // multiply: much more than int.MaxValue