From 49933992f260b0e5239542a8aed64fc6886e0c92 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Fri, 4 Nov 2022 17:48:49 -0400 Subject: [PATCH] Improve IndexOf handling in regex source generator / compiler This PR does a few related things: 1. Consolidates _most_ (but not all) use of IndexOf variants into a single helper that can then be used from multiple locations to avoid code duplication and make it easier for us to extend in the future with additional IndexOf variants. 2. Stops using IndexOf when doing lazy backtracking in an optional. 3. Special-cases "any" repeaters to not do any character checking. 4. Adds use of IndexOf (via the new helper) into repeaters. --- .../gen/RegexGenerator.Emitter.cs | 262 +++++----- .../Text/RegularExpressions/RegexCompiler.cs | 492 +++++++++--------- .../Text/RegularExpressions/RegexNode.cs | 71 ++- 3 files changed, 433 insertions(+), 392 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index e758cac88d68e0..16bf09065ae847 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -1,19 +1,16 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -using System.Buffers.Binary; using System.CodeDom.Compiler; using System.Collections; using System.Collections.Generic; using System.Collections.Immutable; using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; using System.Globalization; using System.IO; using System.Linq; -using System.Net.Cache; -using System.Runtime.InteropServices; using System.Threading; -using System.Web; using Microsoft.CodeAnalysis; using Microsoft.CodeAnalysis.CSharp; @@ -2891,33 +2888,19 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? 
subsequent = null, bool emitL // We're backtracking. Check the timeout. EmitTimeoutCheckIfNeeded(writer, rm); - if (!rtl && subsequent?.FindStartingLiteral() is RegexNode.StartingLiteralData literal) + if (!rtl && + node.N > 1 && // no point in using IndexOf for small loops, in particular optionals + subsequent?.FindStartingLiteralNode() is RegexNode literalNode && + TryEmitIndexOf(literalNode, useLast: true, negate: false, out int literalLength, out string indexOfExpr)) { writer.WriteLine($"if ({startingPos} >= {endingPos} ||"); - (string lastIndexOfName, string lastIndexOfAnyName) = !literal.Negated ? - ("LastIndexOf", "LastIndexOfAny") : - ("LastIndexOfAnyExcept", "LastIndexOfAnyExcept"); string setEndingPosCondition = $" ({endingPos} = inputSpan.Slice({startingPos}, "; - if (literal.String is not null) - { - setEndingPosCondition += $"Math.Min(inputSpan.Length, {endingPos} + {literal.String.Length - 1}) - {startingPos}).{lastIndexOfName}({Literal(literal.String)}"; - } - else - { - setEndingPosCondition += $"{endingPos} - {startingPos})."; - setEndingPosCondition += literal.SetChars is not null ? literal.SetChars.Length switch - { - 2 => $"{lastIndexOfAnyName}({Literal(literal.SetChars[0])}, {Literal(literal.SetChars[1])}", - 3 => $"{lastIndexOfAnyName}({Literal(literal.SetChars[0])}, {Literal(literal.SetChars[1])}, {Literal(literal.SetChars[2])}", - _ => $"{lastIndexOfAnyName}({Literal(literal.SetChars)}", - } : - literal.Range.LowInclusive == literal.Range.HighInclusive ? $"{lastIndexOfName}({Literal(literal.Range.LowInclusive)}" : - $"{lastIndexOfAnyName}InRange({Literal(literal.Range.LowInclusive)}, {Literal(literal.Range.HighInclusive)}"; - } - setEndingPosCondition += ")) < 0)"; + setEndingPosCondition = literalLength > 1 ? 
+ $"{setEndingPosCondition}Math.Min(inputSpan.Length, {endingPos} + {literalLength - 1}) - {startingPos})" : + $"{setEndingPosCondition}{endingPos} - {startingPos})"; - using (EmitBlock(writer, setEndingPosCondition)) + using (EmitBlock(writer, $"{setEndingPosCondition}.{indexOfExpr}) < 0)")) { Goto(doneLabel); } @@ -3098,7 +3081,7 @@ literal.SetChars is not null || (false, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal($"{node.Ch}{literal.SetChars}")});", }); } - else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char + else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char from a RegexNode.One { overlap = literal.Range.LowInclusive == node.Ch; writer.WriteLine(overlap ? @@ -3131,26 +3114,13 @@ literal.SetChars is not null || else if (iterationCount is null && node.Kind is RegexNodeKind.Setlazy && node.Str == RegexCharClass.AnyClass && - subsequent?.FindStartingLiteral() is RegexNode.StartingLiteralData literal2) + subsequent?.FindStartingLiteralNode() is RegexNode literal2 && + TryEmitIndexOf(literal2, useLast: false, negate: false, out _, out string? indexOfExpr)) { // e.g. ".*?string" with RegexOptions.Singleline // This lazy loop will consume all characters until the subsequent literal. If the subsequent literal // isn't found, the loop fails. We can implement it to just search for that literal. - (string indexOfName, string indexOfAnyName) = !literal2.Negated ? - ("IndexOf", "IndexOfAny") : - ("IndexOfAnyExcept", "IndexOfAnyExcept"); - writer.WriteLine($"{startingPos} = {sliceSpan}."); - writer.WriteLine( - literal2.String is not null ? $"{indexOfName}({Literal(literal2.String)});" : - literal2.SetChars is not null ? 
literal2.SetChars.Length switch - { - 2 => $"{indexOfAnyName}({Literal(literal2.SetChars[0])}, {Literal(literal2.SetChars[1])});", - 3 => $"{indexOfAnyName}({Literal(literal2.SetChars[0])}, {Literal(literal2.SetChars[1])}, {Literal(literal2.SetChars[2])});", - _ => $"{indexOfAnyName}({Literal(literal2.SetChars)});", - } : - literal2.Range.LowInclusive == literal2.Range.HighInclusive ? $"{indexOfName}({Literal(literal2.Range.LowInclusive)});" : - $"{indexOfAnyName}InRange({Literal(literal2.Range.LowInclusive)}, {Literal(literal2.Range.HighInclusive)});"); - + writer.WriteLine($"{startingPos} = {sliceSpan}.{indexOfExpr};"); using (EmitBlock(writer, $"if ({startingPos} < 0)")) { Goto(doneLabel); @@ -3543,6 +3513,15 @@ void EmitSingleCharRepeater(RegexNode node, bool emitLengthCheck = true) EmitSingleChar(node); } } + else if (node.IsSetFamily && node.Str == RegexCharClass.AnyClass) + { + // This is a repeater for anything, which means we only care about length and can jump past that length. + if (emitLengthCheck) + { + EmitSpanLengthCheck(iterations); + } + sliceStaticPos += iterations; + } else if (iterations <= MaxUnrollSize) { // if ((uint)(sliceStaticPos + iterations - 1) >= (uint)slice.Length || @@ -3577,20 +3556,37 @@ void EmitSingleCharRepeater(RegexNode node, bool emitLengthCheck = true) if (emitLengthCheck) { EmitSpanLengthCheck(iterations); + writer.WriteLine(); } - string repeaterSpan = "repeaterSlice"; // As this repeater doesn't wrap arbitrary node emits, this shouldn't conflict with anything - writer.WriteLine($"ReadOnlySpan {repeaterSpan} = {sliceSpan}.Slice({sliceStaticPos}, {iterations});"); - using (EmitBlock(writer, $"for (int i = 0; i < {repeaterSpan}.Length; i++)")) + // If we're able to vectorize the search, do so. Otherwise, fall back to a loop. + // For the loop, we're validating that each char matches the target node. 
+ // For IndexOf, we're looking for the first thing that _doesn't_ match the target node, + // and thus similarly validating that everything does. + if (TryEmitIndexOf(node, useLast: false, negate: true, out _, out string? indexOfExpr)) { - string tmpTextSpanLocal = sliceSpan; // we want EmitSingleChar to refer to this temporary - int tmpSliceStaticPos = sliceStaticPos; - sliceSpan = repeaterSpan; - sliceStaticPos = 0; - EmitSingleChar(node, emitLengthCheck: false, offset: "i"); - sliceSpan = tmpTextSpanLocal; - sliceStaticPos = tmpSliceStaticPos; + using (EmitBlock(writer, $"if ({sliceSpan}.Slice({sliceStaticPos}, {iterations}).{indexOfExpr} >= 0)")) + { + Goto(doneLabel); + } } + else + { + string repeaterSpan = "repeaterSlice"; // As this repeater doesn't wrap arbitrary node emits, this shouldn't conflict with anything + writer.WriteLine($"ReadOnlySpan {repeaterSpan} = {sliceSpan}.Slice({sliceStaticPos}, {iterations});"); + + using (EmitBlock(writer, $"for (int i = 0; i < {repeaterSpan}.Length; i++)")) + { + string tmpTextSpanLocal = sliceSpan; // we want EmitSingleChar to refer to this temporary + int tmpSliceStaticPos = sliceStaticPos; + sliceSpan = repeaterSpan; + sliceStaticPos = 0; + EmitSingleChar(node, emitLengthCheck: false, offset: "i"); + sliceSpan = tmpTextSpanLocal; + sliceStaticPos = tmpSliceStaticPos; + } + } + sliceStaticPos += iterations; } } @@ -3618,9 +3614,6 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = int minIterations = node.M; int maxIterations = node.N; bool rtl = (node.Options & RegexOptions.RightToLeft) != 0; - - Span setChars = stackalloc char[5]; // 5 is max optimized by IndexOfAny today - int numSetChars = 0; string iterationLocal = ReserveName("iteration"); if (rtl) @@ -3655,61 +3648,6 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = writer.WriteLine(); } } - else if ((node.IsOneFamily || node.IsNotoneFamily) && maxIterations == int.MaxValue) - { - // For One or 
Notone, we're looking for a specific character, as everything until we find - // it (or its negation in the case of One) is consumed by the loop. If we're unbounded, such as with ".*" and if we're case-sensitive, - // we can use the vectorized IndexOf{AnyExcept} to do the search, rather than open-coding it. The unbounded - // restriction is purely for simplicity; it could be removed in the future with additional code to - // handle the unbounded case. - - writer.Write($"int {iterationLocal} = {sliceSpan}"); - if (sliceStaticPos > 0) - { - writer.Write($".Slice({sliceStaticPos})"); - } - string op = node.IsNotoneFamily ? "IndexOf" : "IndexOfAnyExcept"; - writer.WriteLine($".{op}({Literal(node.Ch)});"); - - using (EmitBlock(writer, $"if ({iterationLocal} < 0)")) - { - writer.WriteLine(sliceStaticPos > 0 ? - $"{iterationLocal} = {sliceSpan}.Length - {sliceStaticPos};" : - $"{iterationLocal} = {sliceSpan}.Length;"); - } - writer.WriteLine(); - } - else if (node.IsSetFamily && - maxIterations == int.MaxValue && - (numSetChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0) - { - // If the set contains only a few characters (if it contained 1 and was negated, it should - // have been reduced to a Notone), we can use an IndexOfAny{Except} to find any of the target characters. - // As with the notoneloopatomic above, the unbounded constraint is purely for simplicity. 
- Debug.Assert(numSetChars > 1); - - writer.Write($"int {iterationLocal} = {sliceSpan}"); - if (sliceStaticPos != 0) - { - writer.Write($".Slice({sliceStaticPos})"); - } - writer.WriteLine((numSetChars, RegexCharClass.IsNegated(node.Str!)) switch - { - (2, true) => $".IndexOfAny({Literal(setChars[0])}, {Literal(setChars[1])});", - (3, true) => $".IndexOfAny({Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])});", - (_, true) => $".IndexOfAny({Literal(setChars.Slice(0, numSetChars).ToString())});", - (2, false) => $".IndexOfAnyExcept({Literal(setChars[0])}, {Literal(setChars[1])});", - (3, false) => $".IndexOfAnyExcept({Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])});", - (_, false) => $".IndexOfAnyExcept({Literal(setChars.Slice(0, numSetChars).ToString())});", - }); - using (EmitBlock(writer, $"if ({iterationLocal} < 0)")) - { - writer.WriteLine(sliceStaticPos > 0 ? - $"{iterationLocal} = {sliceSpan}.Length - {sliceStaticPos};" : - $"{iterationLocal} = {sliceSpan}.Length;"); - } - writer.WriteLine(); - } else if (node.IsSetFamily && maxIterations == int.MaxValue && node.Str == RegexCharClass.AnyClass) { // .* was used with RegexOptions.Singleline, which means it'll consume everything. Just jump to the end. @@ -3718,20 +3656,18 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = TransferSliceStaticPosToPos(); writer.WriteLine($"int {iterationLocal} = inputSpan.Length - pos;"); } - else if (node.IsSetFamily && - maxIterations == int.MaxValue && - RegexCharClass.TryGetSingleRange(node.Str!, out char rangeLowInclusive, out char rangeHighInclusive)) + else if (maxIterations == int.MaxValue && TryEmitIndexOf(node, useLast: false, negate: true, out _, out string indexOfExpr)) { - // If the set contains a single range, we can use an IndexOfAny{Except}InRange to find any of the target characters. - // As with the cases above, the unbounded constraint is purely for simplicity. 
- string indexOfMethod = RegexCharClass.IsNegated(node.Str!) ? "IndexOfAnyInRange" : "IndexOfAnyExceptInRange"; + // We're unbounded and we can use an IndexOf method to perform the search. The unbounded restriction is + // purely for simplicity; it could be removed in the future with additional code to handle that case. writer.Write($"int {iterationLocal} = {sliceSpan}"); if (sliceStaticPos != 0) { writer.Write($".Slice({sliceStaticPos})"); } - writer.WriteLine($".{indexOfMethod}({Literal(rangeLowInclusive)}, {Literal(rangeHighInclusive)});"); + writer.WriteLine($".{indexOfExpr};"); + using (EmitBlock(writer, $"if ({iterationLocal} < 0)")) { writer.WriteLine(sliceStaticPos > 0 ? @@ -3745,14 +3681,9 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = // For everything else, do a normal loop. string expr = $"{sliceSpan}[{iterationLocal}]"; - if (node.IsSetFamily) - { - expr = MatchCharacterClass(options, expr, node.Str!, negate: false, additionalDeclarations, requiredHelpers); - } - else - { - expr = $"{expr} {(node.IsOneFamily ? "==" : "!=")} {Literal(node.Ch)}"; - } + expr = node.IsSetFamily ? + MatchCharacterClass(options, expr, node.Str!, negate: false, additionalDeclarations, requiredHelpers) : + $"{expr} {(node.IsOneFamily ? "==" : "!=")} {Literal(node.Ch)}"; if (minIterations != 0 || maxIterations != int.MaxValue) { @@ -4348,6 +4279,85 @@ private static void EmitTimeoutCheckIfNeeded(IndentedTextWriter writer, RegexMet } } + /// <summary>Tries to create an IndexOf expression for the node.</summary> + /// <param name="node">The RegexNode. If it's a loop, only the one/notone/set aspect of the node is factored in.</param> + /// <param name="useLast">true to use LastIndexOf variants; false to use IndexOf variants.</param> + /// <param name="negate">true to search for the opposite of the node.</param> + /// <param name="literalLength">0 if returns false. If it returns true, string.Length for a multi, otherwise 1.</param> + /// <param name="indexOfExpr">The resulting expression if it returns true; otherwise, null.</param> + /// <returns>true if an expression could be produced; otherwise, false.</returns> 
+ private static bool TryEmitIndexOf( + RegexNode node, + bool useLast, bool negate, + out int literalLength, [NotNullWhen(true)] out string? indexOfExpr) + { + string last = useLast ? "Last" : ""; + + if (node.Kind == RegexNodeKind.Multi) + { + Debug.Assert(!negate, "Negation isn't appropriate for a multi"); + indexOfExpr = $"{last}IndexOf({Literal(node.Str)})"; + literalLength = node.Str.Length; + return true; + } + + if (node.IsOneFamily) + { + indexOfExpr = negate ? $"{last}IndexOfAnyExcept({Literal(node.Ch)})" : $"{last}IndexOf({Literal(node.Ch)})"; + literalLength = 1; + return true; + } + + if (node.IsNotoneFamily) + { + indexOfExpr = negate ? $"{last}IndexOf({Literal(node.Ch)})" : $"{last}IndexOfAnyExcept({Literal(node.Ch)})"; + literalLength = 1; + return true; + } + + if (node.IsSetFamily) + { + bool negated = RegexCharClass.IsNegated(node.Str) ^ negate; + + Span setChars = stackalloc char[5]; // current max that's vectorized + int setCharsCount; + if ((setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars)) > 0) + { + (string indexOfName, string indexOfAnyName) = !negated ? + ("IndexOf", "IndexOfAny") : + ("IndexOfAnyExcept", "IndexOfAnyExcept"); + + setChars = setChars.Slice(0, setCharsCount); + indexOfExpr = setChars.Length switch + { + 1 => $"{last}{indexOfName}({Literal(setChars[0])})", + 2 => $"{last}{indexOfAnyName}({Literal(setChars[0])}, {Literal(setChars[1])})", + 3 => $"{last}{indexOfAnyName}({Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])})", + _ => $"{last}{indexOfAnyName}({Literal(setChars.ToString())})", + }; + + literalLength = 1; + return true; + } + + if (RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive)) + { + string indexOfAnyInRangeName = !negated ? 
+ "IndexOfAnyInRange" : + "IndexOfAnyExceptInRange"; + + indexOfExpr = $"{last}{indexOfAnyInRangeName}({Literal(lowInclusive)}, {Literal(highInclusive)})"; + + literalLength = 1; + return true; + } + } + + indexOfExpr = null; + literalLength = 0; + return false; + } + private static string MatchCharacterClass(RegexOptions options, string chExpr, string charClass, bool negate, HashSet additionalDeclarations, Dictionary requiredHelpers) { // We need to perform the equivalent of calling RegexRunner.CharInClass(ch, charClass), diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 2eee61f47e2ce9..0ed046e282fdd7 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -2792,7 +2792,7 @@ void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, LocalBuilder? o // if (loadedChar != ch) goto doneLabel; if (node.IsSetFamily) { - EmitMatchCharacterClass(node.Str!); + EmitMatchCharacterClass(node.Str); BrfalseFar(doneLabel); } else @@ -3176,7 +3176,10 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL BleFar(doneLabel); } - if (!rtl && subsequent?.FindStartingLiteral() is RegexNode.StartingLiteralData literal) + if (!rtl && + node.N > 1 && + subsequent?.FindStartingLiteralNode() is RegexNode literal && + CanEmitIndexOf(literal, out int literalLength)) { // endingPos = inputSpan.Slice(startingPos, Math.Min(inputSpan.Length, endingPos + literal.Length - 1) - startingPos).LastIndexOf(literal); // if (endingPos < 0) @@ -3185,65 +3188,28 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? 
subsequent = null, bool emitL // } Ldloca(inputSpan); Ldloc(startingPos); - if (literal.String is not null) + if (literalLength > 1) { - Debug.Assert(!literal.Negated, "strings should not be negated"); + // Math.Min(inputSpan.Length, endingPos + literal.Length - 1) - startingPos Ldloca(inputSpan); Call(s_spanGetLengthMethod); Ldloc(endingPos); - Ldc(literal.String.Length - 1); + Ldc(literalLength - 1); Add(); Call(s_mathMinIntInt); - Ldloc(startingPos); - Sub(); - Call(s_spanSliceIntIntMethod); - Ldstr(literal.String); - Call(s_stringAsSpanMethod); - Call(s_spanLastIndexOfSpan); } else { + // endingPos - startingPos Ldloc(endingPos); - Ldloc(startingPos); - Sub(); - Call(s_spanSliceIntIntMethod); - if (literal.SetChars is not null) - { - switch (literal.SetChars.Length) - { - case 2: - Ldc(literal.SetChars[0]); - Ldc(literal.SetChars[1]); - Call(literal.Negated ? s_spanLastIndexOfAnyExceptCharChar : s_spanLastIndexOfAnyCharChar); - break; - - case 3: - Ldc(literal.SetChars[0]); - Ldc(literal.SetChars[1]); - Ldc(literal.SetChars[2]); - Call(literal.Negated ? s_spanLastIndexOfAnyExceptCharCharChar : s_spanLastIndexOfAnyCharCharChar); - break; - - default: - Ldstr(literal.SetChars); - Call(s_stringAsSpanMethod); - Call(literal.Negated ? s_spanLastIndexOfAnyExceptSpan : s_spanLastIndexOfAnySpan); - break; - } - } - else if (literal.Range.LowInclusive == literal.Range.HighInclusive) - { - Ldc(literal.Range.LowInclusive); - Call(literal.Negated ? s_spanLastIndexOfAnyExceptChar : s_spanLastIndexOfChar); - } - else - { - Ldc(literal.Range.LowInclusive); - Ldc(literal.Range.HighInclusive); - Call(literal.Negated ? 
s_spanLastIndexOfAnyExceptInRange : s_spanLastIndexOfAnyInRange); - } } + Ldloc(startingPos); + Sub(); + Call(s_spanSliceIntIntMethod); + + EmitIndexOf(literal, useLast: true, negate: false); Stloc(endingPos); + Ldloc(endingPos); Ldc(0); BltFar(doneLabel); @@ -3487,7 +3453,7 @@ literal.SetChars is not null || break; } } - else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // char literal + else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char from a RegexNode.One { overlap = literal.Range.LowInclusive == node.Ch; if (overlap) @@ -3557,7 +3523,8 @@ literal.SetChars is not null || iterationCount is null && node.Kind is RegexNodeKind.Setlazy && node.Str == RegexCharClass.AnyClass && - subsequent?.FindStartingLiteral() is RegexNode.StartingLiteralData literal2) + subsequent?.FindStartingLiteralNode() is RegexNode literal2 && + CanEmitIndexOf(literal2, out _)) { // e.g. ".*?string" with RegexOptions.Singleline // This lazy loop will consume all characters until the subsequent literal. If the subsequent literal @@ -3565,50 +3532,7 @@ node.Kind is RegexNodeKind.Setlazy && // startingPos = slice.IndexOf(literal); Ldloc(slice); - if (literal2.String is not null) - { - Debug.Assert(!literal2.Negated, "strings should not be negated"); - Ldstr(literal2.String); - Call(s_stringAsSpanMethod); - Call(s_spanIndexOfSpan); - } - else if (literal2.SetChars is not null) - { - switch (literal2.SetChars.Length) - { - case 2: - Ldc(literal2.SetChars[0]); - Ldc(literal2.SetChars[1]); - Call(literal2.Negated ? s_spanIndexOfAnyExceptCharChar : s_spanIndexOfAnyCharChar); - break; - - case 3: - Ldc(literal2.SetChars[0]); - Ldc(literal2.SetChars[1]); - Ldc(literal2.SetChars[2]); - Call(literal2.Negated ? s_spanIndexOfAnyExceptCharCharChar : s_spanIndexOfAnyCharCharChar); - break; - - default: - Ldstr(literal2.SetChars); - Call(s_stringAsSpanMethod); - Call(literal2.Negated ? 
s_spanIndexOfAnyExceptSpan : s_spanIndexOfAnySpan); - break; - } - } - else - { - Ldc(literal2.Range.LowInclusive); - if (literal2.Range.LowInclusive == literal2.Range.HighInclusive) - { - Call(literal2.Negated ? s_spanIndexOfAnyExceptChar : s_spanIndexOfChar); - } - else - { - Ldc(literal2.Range.HighInclusive); - Call(literal2.Negated ? s_spanIndexOfAnyExceptInRange : s_spanIndexOfAnyInRange); - } - } + EmitIndexOf(literal2, useLast: false, negate: false); Stloc(startingPos); // if (startingPos < 0) goto doneLabel; @@ -4114,6 +4038,13 @@ void EmitSingleCharRepeater(RegexNode node, bool emitLengthChecksIfRequired = tr EmitSpanLengthCheck(iterations); } + // If this is a repeater for anything, we only care about length and can jump past that length. + if (node.IsSetFamily && node.Str == RegexCharClass.AnyClass) + { + sliceStaticPos += iterations; + return; + } + // Arbitrary limit for unrolling vs creating a loop. We want to balance size in the generated // code with other costs, like the (small) overhead of slicing to create the temp span to iterate. const int MaxUnrollSize = 16; @@ -4132,48 +4063,61 @@ void EmitSingleCharRepeater(RegexNode node, bool emitLengthChecksIfRequired = tr else { // ReadOnlySpan<char> tmp = slice.Slice(sliceStaticPos, iterations); - // for (int i = 0; i < tmp.Length; i++) - // { - // TimeoutCheck(); - // if (tmp[i] != ch) goto Done; - // } - // sliceStaticPos += iterations; - - Label conditionLabel = DefineLabel(); - Label bodyLabel = DefineLabel(); - - using RentedLocalBuilder spanLocal = RentReadOnlySpanCharLocal(); Ldloca(slice); Ldc(sliceStaticPos); Ldc(iterations); Call(s_spanSliceIntIntMethod); - Stloc(spanLocal); - using RentedLocalBuilder iterationLocal = RentInt32Local(); - Ldc(0); - Stloc(iterationLocal); - BrFar(conditionLabel); + // If we're able to vectorize the search, do so. Otherwise, fall back to a loop. + // For the loop, we're validating that each char matches the target node. 
+ // For IndexOf, we're looking for the first thing that _doesn't_ match the target node, + // and thus similarly validating that everything does. + if (CanEmitIndexOf(node, out _)) + { + // if (tmp.IndexOf(...) >= 0) goto doneLabel; + EmitIndexOf(node, useLast: false, negate: true); + Ldc(0); + BgeFar(doneLabel); + } + else + { + using RentedLocalBuilder spanLocal = RentReadOnlySpanCharLocal(); + Stloc(spanLocal); - MarkLabel(bodyLabel); + // for (int i = 0; i < tmp.Length; i++) + // { + // if (tmp[i] != ch) goto Done; + // } - LocalBuilder tmpTextSpanLocal = slice; // we want EmitSingleChar to refer to this temporary - int tmpTextSpanPos = sliceStaticPos; - slice = spanLocal; - sliceStaticPos = 0; - EmitSingleChar(node, emitLengthCheck: false, offset: iterationLocal); - slice = tmpTextSpanLocal; - sliceStaticPos = tmpTextSpanPos; + Label conditionLabel = DefineLabel(); + Label bodyLabel = DefineLabel(); - Ldloc(iterationLocal); - Ldc(1); - Add(); - Stloc(iterationLocal); + using RentedLocalBuilder iterationLocal = RentInt32Local(); + Ldc(0); + Stloc(iterationLocal); + BrFar(conditionLabel); - MarkLabel(conditionLabel); - Ldloc(iterationLocal); - Ldloca(spanLocal); - Call(s_spanGetLengthMethod); - BltFar(bodyLabel); + MarkLabel(bodyLabel); + + LocalBuilder tmpTextSpanLocal = slice; // we want EmitSingleChar to refer to this temporary + int tmpTextSpanPos = sliceStaticPos; + slice = spanLocal; + sliceStaticPos = 0; + EmitSingleChar(node, emitLengthCheck: false, offset: iterationLocal); + slice = tmpTextSpanLocal; + sliceStaticPos = tmpTextSpanPos; + + Ldloc(iterationLocal); + Ldc(1); + Add(); + Stloc(iterationLocal); + + MarkLabel(conditionLabel); + Ldloc(iterationLocal); + Ldloca(spanLocal); + Call(s_spanGetLengthMethod); + BltFar(bodyLabel); + } sliceStaticPos += iterations; } @@ -4202,14 +4146,9 @@ void EmitSingleCharAtomicLoop(RegexNode node) int minIterations = node.M; int maxIterations = node.N; bool rtl = (node.Options & RegexOptions.RightToLeft) != 0; - 
using RentedLocalBuilder iterationLocal = RentInt32Local(); - Label atomicLoopDoneLabel = DefineLabel(); - Span setChars = stackalloc char[5]; // max optimized by IndexOfAny today - int numSetChars = 0; - if (rtl) { TransferSliceStaticPosToPos(); // we don't use static position for rtl @@ -4242,7 +4181,7 @@ void EmitSingleCharAtomicLoop(RegexNode node) LdindU2(); if (node.IsSetFamily) { - EmitMatchCharacterClass(node.Str!); + EmitMatchCharacterClass(node.Str); BrfalseFar(atomicLoopDoneLabel); } else @@ -4277,103 +4216,6 @@ void EmitSingleCharAtomicLoop(RegexNode node) BrFar(bodyLabel); } } - else if ((node.IsOneFamily || node.IsNotoneFamily) && maxIterations == int.MaxValue) - { - // For One or Notone, we're looking for a specific character, as everything until we find - // it (or its negation in the case of One) is consumed by the loop. If we're unbounded, such as with ".*" and if we're case-sensitive, - // we can use the vectorized IndexOf{AnyExcept} to do the search, rather than open-coding it. The unbounded - // restriction is purely for simplicity; it could be removed in the future with additional code to - // handle the unbounded case. - - // int i = slice.Slice(sliceStaticPos).IndexOf(char); - if (sliceStaticPos > 0) - { - Ldloca(slice); - Ldc(sliceStaticPos); - Call(s_spanSliceIntMethod); - } - else - { - Ldloc(slice); - } - Ldc(node.Ch); - Call(node.IsNotoneFamily ? 
s_spanIndexOfChar : s_spanIndexOfAnyExceptChar); - Stloc(iterationLocal); - - // if (i >= 0) goto atomicLoopDoneLabel; - Ldloc(iterationLocal); - Ldc(0); - BgeFar(atomicLoopDoneLabel); - - // i = slice.Length - sliceStaticPos; - Ldloca(slice); - Call(s_spanGetLengthMethod); - if (sliceStaticPos > 0) - { - Ldc(sliceStaticPos); - Sub(); - } - Stloc(iterationLocal); - } - else if (node.IsSetFamily && - maxIterations == int.MaxValue && - (numSetChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0) - { - // If the set contains only a few characters (if it contained 1 and was negated, it should - // have been reduced to a Notone), we can use an IndexOfAny{Except} to find any of the target characters. - // As with the notoneloopatomic above, the unbounded constraint is purely for simplicity. - Debug.Assert(numSetChars > 1); - bool negated = RegexCharClass.IsNegated(node.Str!); - - // int i = slice.Slice(sliceStaticPos).IndexOfAny(ch1, ch2, ...); - if (sliceStaticPos > 0) - { - Ldloca(slice); - Ldc(sliceStaticPos); - Call(s_spanSliceIntMethod); - } - else - { - Ldloc(slice); - } - switch (numSetChars) - { - case 2: - Ldc(setChars[0]); - Ldc(setChars[1]); - Call(negated ? s_spanIndexOfAnyCharChar : s_spanIndexOfAnyExceptCharChar); - break; - - case 3: - Ldc(setChars[0]); - Ldc(setChars[1]); - Ldc(setChars[2]); - Call(negated ? s_spanIndexOfAnyCharCharChar : s_spanIndexOfAnyExceptCharCharChar); - break; - - default: - Ldstr(setChars.Slice(0, numSetChars).ToString()); - Call(s_stringAsSpanMethod); - Call(negated ? 
s_spanIndexOfAnySpan : s_spanIndexOfAnyExceptSpan); - break; - } - Stloc(iterationLocal); - - // if (i >= 0) goto atomicLoopDoneLabel; - Ldloc(iterationLocal); - Ldc(0); - BgeFar(atomicLoopDoneLabel); - - // i = slice.Length - sliceStaticPos; - Ldloca(slice); - Call(s_spanGetLengthMethod); - if (sliceStaticPos > 0) - { - Ldc(sliceStaticPos); - Sub(); - } - Stloc(iterationLocal); - } else if (node.IsSetFamily && maxIterations == int.MaxValue && node.Str == RegexCharClass.AnyClass) { // .* was used with RegexOptions.Singleline, which means it'll consume everything. Just jump to the end. @@ -4387,14 +4229,12 @@ void EmitSingleCharAtomicLoop(RegexNode node) Sub(); Stloc(iterationLocal); } - else if (node.IsSetFamily && - maxIterations == int.MaxValue && - RegexCharClass.TryGetSingleRange(node.Str!, out char rangeLowInclusive, out char rangeHighInclusive)) + else if (maxIterations == int.MaxValue && CanEmitIndexOf(node, out _)) { - // If the set contains a single range, we can use an IndexOfAny{Except}InRange to find any of the target characters. - // As with the cases above, the unbounded constraint is purely for simplicity. + // We're unbounded and we can use an IndexOf method to perform the search. The unbounded restriction is + // purely for simplicity; it could be removed in the future with additional code to handle that case. - // int i = slice.Slice(sliceStaticPos).IndexOfAny{Except}InRange(rangeLowInclusive, rangeHighInclusive); + // int i = slice.Slice(sliceStaticPos).IndexOf(...); if (sliceStaticPos > 0) { Ldloca(slice); @@ -4405,9 +4245,8 @@ void EmitSingleCharAtomicLoop(RegexNode node) { Ldloc(slice); } - Ldc(rangeLowInclusive); - Ldc(rangeHighInclusive); - Call(RegexCharClass.IsNegated(node.Str!) ? 
s_spanIndexOfAnyInRange : s_spanIndexOfAnyExceptInRange); + + EmitIndexOf(node, useLast: false, negate: true); Stloc(iterationLocal); // if (i >= 0) goto atomicLoopDoneLabel; @@ -4457,7 +4296,7 @@ void EmitSingleCharAtomicLoop(RegexNode node) LdindU2(); if (node.IsSetFamily) { - EmitMatchCharacterClass(node.Str!); + EmitMatchCharacterClass(node.Str); BrfalseFar(atomicLoopDoneLabel); } else @@ -4579,7 +4418,7 @@ void EmitAtomicSingleCharZeroOrOne(RegexNode node) LdindU2(); if (node.IsSetFamily) { - EmitMatchCharacterClass(node.Str!); + EmitMatchCharacterClass(node.Str); BrfalseFar(skipUpdatesLabel); } else @@ -5013,6 +4852,175 @@ void EmitLoop(RegexNode node) } } + // Gets whether an IndexOf expression can be emitted for the node. + // The RegexNode. If it's a loop, only the one/notone/set aspect of the node is factored in. + // 0 if returns false. If it returns true, string.Length for a multi, otherwise 1. + // true if an IndexOf can be emitted; otherwise, false. + bool CanEmitIndexOf(RegexNode node, out int literalLength) + { + if (node.Kind == RegexNodeKind.Multi) + { + literalLength = node.Str!.Length; + return true; + } + + if (node.IsOneFamily || node.IsNotoneFamily) + { + literalLength = 1; + return true; + } + + if (node.IsSetFamily) + { + Span setChars = stackalloc char[5]; // current max that's vectorized + int setCharsCount; + if ((setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars)) > 0) + { + literalLength = 1; + return true; + } + + if (RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive)) + { + literalLength = 1; + return true; + } + } + + literalLength = 0; + return false; + } + + // Emits the code for IndexOf call based on the node. + // The RegexNode. If it's a loop, only the one/notone/set aspect of the node is factored in. + // true to use LastIndexOf variants; false to use IndexOf variants. + // true to search for the opposite of the node. 
+ void EmitIndexOf(RegexNode node, bool useLast, bool negate) + { + if (node.Kind == RegexNodeKind.Multi) + { + // IndexOf(span) + Debug.Assert(!negate, "Negation isn't appropriate for a multi"); + Ldstr(node.Str!); + Call(s_stringAsSpanMethod); + Call(useLast ? s_spanLastIndexOfSpan : s_spanIndexOfSpan); + return; + } + + if (node.IsOneFamily || node.IsNotoneFamily) + { + // IndexOf{AnyExcept}(char) + + if (node.IsNotoneFamily) + { + negate = !negate; + } + + Ldc(node.Ch); + Call((useLast, negate) switch + { + (false, false) => s_spanIndexOfChar, + (false, true) => s_spanIndexOfAnyExceptChar, + (true, false) => s_spanLastIndexOfChar, + (true, true) => s_spanLastIndexOfAnyExceptChar, + }); + return; + } + + if (node.IsSetFamily) + { + bool negated = RegexCharClass.IsNegated(node.Str) ^ negate; + + // IndexOfAny{Except}(ch1, ...) + Span setChars = stackalloc char[5]; // current max that's vectorized + int setCharsCount; + if ((setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars)) > 0) + { + setChars = setChars.Slice(0, setCharsCount); + switch (setChars.Length) + { + case 1: + Ldc(setChars[0]); + Call((useLast, negated) switch + { + (false, false) => s_spanIndexOfChar, + (false, true) => s_spanIndexOfAnyExceptChar, + (true, false) => s_spanLastIndexOfChar, + (true, true) => s_spanLastIndexOfAnyExceptChar, + }); + return; + + case 2: + Ldc(setChars[0]); + Ldc(setChars[1]); + Call((useLast, negated) switch + { + (false, false) => s_spanIndexOfAnyCharChar, + (false, true) => s_spanIndexOfAnyExceptCharChar, + (true, false) => s_spanLastIndexOfAnyCharChar, + (true, true) => s_spanLastIndexOfAnyExceptCharChar, + }); + return; + + case 3: + Ldc(setChars[0]); + Ldc(setChars[1]); + Ldc(setChars[2]); + Call((useLast, negated) switch + { + (false, false) => s_spanIndexOfAnyCharCharChar, + (false, true) => s_spanIndexOfAnyExceptCharCharChar, + (true, false) => s_spanLastIndexOfAnyCharCharChar, + (true, true) => s_spanLastIndexOfAnyExceptCharCharChar, + }); + return; + 
+ default: + Ldstr(setChars.ToString()); + Call(s_stringAsSpanMethod); + Call((useLast, negated) switch + { + (false, false) => s_spanIndexOfAnySpan, + (false, true) => s_spanIndexOfAnyExceptSpan, + (true, false) => s_spanLastIndexOfAnySpan, + (true, true) => s_spanLastIndexOfAnyExceptSpan, + }); + return; + } + } + + // IndexOfAny{Except}InRange + if (RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive)) + { + if (lowInclusive == highInclusive) + { + Ldc(lowInclusive); + Call((useLast, negated) switch + { + (false, false) => s_spanIndexOfChar, + (false, true) => s_spanIndexOfAnyExceptChar, + (true, false) => s_spanLastIndexOfChar, + (true, true) => s_spanLastIndexOfAnyExceptChar, + }); + return; + } + + Ldc(lowInclusive); + Ldc(highInclusive); + Call((useLast, negated) switch + { + (false, false) => s_spanIndexOfAnyInRange, + (false, true) => s_spanIndexOfAnyExceptInRange, + (true, false) => s_spanLastIndexOfAnyInRange, + (true, true) => s_spanLastIndexOfAnyExceptInRange, + }); + return; + } + } + + Debug.Fail("We should never get here. This method should only be called if CanEmitIndexOf returned true, and all of the same cases should be covered."); + } + // // If the expression contains captures, pops a crawl position from the stack and uncaptures // until that position is reached. diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index d2eef1c622f6d1..80ec75bda88098 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -1381,10 +1381,8 @@ public char FirstCharOfOneOrMulti() /// A tuple of data about the literal: only one of the Char/String/SetChars fields is relevant. 
/// The Negated value indicates whether the Char/SetChars should be considered exclusionary. /// - public StartingLiteralData? FindStartingLiteral(int maxSetCharacters = 5) // 5 is max optimized by IndexOfAny today + public RegexNode? FindStartingLiteralNode() { - Debug.Assert(maxSetCharacters >= 0 && maxSetCharacters <= 128, $"{nameof(maxSetCharacters)} == {maxSetCharacters} should be small enough to be stack allocated."); - RegexNode? node = this; while (true) { @@ -1394,31 +1392,12 @@ public char FirstCharOfOneOrMulti() { case RegexNodeKind.One: case RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy when node.M > 0: - return new StartingLiteralData(range: (node.Ch, node.Ch), @string: null, setChars: null, negated: false); - case RegexNodeKind.Notone: case RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy when node.M > 0: - return new StartingLiteralData(range: (node.Ch, node.Ch), @string: null, setChars: null, negated: true); - - case RegexNodeKind.Multi: - return new StartingLiteralData(range: default, @string: node.Str, setChars: null, negated: false); - case RegexNodeKind.Set: case RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy when node.M > 0: - Span setChars = stackalloc char[maxSetCharacters]; - int numChars; - if ((numChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0) - { - setChars = setChars.Slice(0, numChars); - return new StartingLiteralData(range: default, @string: null, setChars: setChars.ToString(), negated: RegexCharClass.IsNegated(node.Str!)); - } - - if (RegexCharClass.TryGetSingleRange(node.Str!, out char lowInclusive, out char highInclusive)) - { - Debug.Assert(lowInclusive < highInclusive); - return new StartingLiteralData(range: (lowInclusive, highInclusive), @string: null, setChars: null, negated: RegexCharClass.IsNegated(node.Str!)); - } - break; + case RegexNodeKind.Multi: + return node; case RegexNodeKind.Atomic: case 
RegexNodeKind.Concatenate: @@ -1435,6 +1414,49 @@ public char FirstCharOfOneOrMulti() } } + /// Finds the guaranteed beginning literal(s) of the node, or null if none exists. + /// + /// A tuple of data about the literal: only one of the Char/String/SetChars fields is relevant. + /// The Negated value indicates whether the Char/SetChars should be considered exclusionary. + /// + public StartingLiteralData? FindStartingLiteral(int maxSetCharacters = 5) // 5 is max optimized by IndexOfAny today + { + Debug.Assert(maxSetCharacters >= 0 && maxSetCharacters <= 128, $"{nameof(maxSetCharacters)} == {maxSetCharacters} should be small enough to be stack allocated."); + + if (FindStartingLiteralNode() is RegexNode node) + { + switch (node.Kind) + { + case RegexNodeKind.One or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy: + return new StartingLiteralData(range: (node.Ch, node.Ch), @string: null, setChars: null, negated: false); + + case RegexNodeKind.Notone or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy: + return new StartingLiteralData(range: (node.Ch, node.Ch), @string: null, setChars: null, negated: true); + + case RegexNodeKind.Set or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy: + Span setChars = stackalloc char[maxSetCharacters]; + int numChars; + if ((numChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0) + { + setChars = setChars.Slice(0, numChars); + return new StartingLiteralData(range: default, @string: null, setChars: setChars.ToString(), negated: RegexCharClass.IsNegated(node.Str!)); + } + + if (RegexCharClass.TryGetSingleRange(node.Str!, out char lowInclusive, out char highInclusive)) + { + Debug.Assert(lowInclusive < highInclusive); + return new StartingLiteralData(range: (lowInclusive, highInclusive), @string: null, setChars: null, negated: RegexCharClass.IsNegated(node.Str!)); + } + break; + + case RegexNodeKind.Multi: + return new 
StartingLiteralData(range: default, @string: node.Str, setChars: null, negated: false); + } + } + + return null; + } + /// Data about a starting literal as returned by FindStartingLiteral. public readonly struct StartingLiteralData { @@ -2767,6 +2789,7 @@ static bool ExceedsMaxDepthAllowedDepth(RegexNode node, int allowedDepth) } /// Gets whether the node is a Set/Setloop/Setloopatomic/Setlazy node. + [MemberNotNullWhen(true, nameof(Str))] public bool IsSetFamily => Kind is RegexNodeKind.Set or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy; /// Gets whether the node is a One/Oneloop/Oneloopatomic/Onelazy node.