Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Changes from 1 commit
Commits
Show all changes
63 commits
Select commit Hold shift + click to select a range
34eba54
Regex automata optimizations
ieviev May 24, 2024
49607f4
off by one err
ieviev May 24, 2024
5ac29f3
wip reversal optimizations
ieviev May 26, 2024
e440dec
removing unnecessary overhead
ieviev May 26, 2024
627fd90
handle final position correctly
ieviev May 26, 2024
7ae6440
edge case workarounds, tests should be ok again
ieviev May 27, 2024
383f3e5
optimizing lookup initialization
ieviev May 27, 2024
5a2636c
more dfa overhead removed
ieviev May 28, 2024
57e5b8d
removed potential rewrite
ieviev May 28, 2024
4d275db
low memory variant
ieviev May 28, 2024
c35ed7e
some kind of compromise between speed and memory
ieviev May 28, 2024
868e02d
cheaper nullability checks
ieviev May 29, 2024
14afd18
nullability encoding
ieviev May 29, 2024
5f5ab55
nullability cached as bytes
ieviev May 29, 2024
dd121de
reverting some changes
ieviev May 30, 2024
723c5b6
testing nfa fallback
ieviev Jun 5, 2024
6bf4095
refactoring, work in progress
ieviev Jun 17, 2024
b10e600
refactoring to struct interfaces
ieviev Jun 18, 2024
d68bd3c
refactoring optimizations
ieviev Jun 18, 2024
153dfc3
fallback mode and bugfix
ieviev Jun 18, 2024
4aebe3e
reenable warnings
ieviev Jun 18, 2024
1e6f55c
anchor edge case
ieviev Jun 19, 2024
c6ad3ac
anchor edge cases
ieviev Jun 19, 2024
e10b43f
Apply suggestions from code review
ieviev Jun 19, 2024
f581755
Apply suggestions from code review
ieviev Jun 27, 2024
01a9684
rebased branch and some cleanup
ieviev Jun 27, 2024
341ce27
cleanup, removing unused features
ieviev Jun 27, 2024
1a28c69
cleanup
ieviev Jun 27, 2024
9bba84f
timeout limit changes
ieviev Jun 29, 2024
a957781
lookup allocation threshold and timeout limits
ieviev Jun 30, 2024
7e86855
char mapping
ieviev Jun 30, 2024
99b5717
empty array mapping
ieviev Jun 30, 2024
47c6b04
adding timeout check to create-derivative
ieviev Jun 30, 2024
22d23fa
some cleanup
ieviev Jun 30, 2024
761f897
comments and cleanup
ieviev Jun 30, 2024
53924eb
cleanup and comments
ieviev Jun 30, 2024
e66d3d3
reflecting new limits in tests
ieviev Jul 1, 2024
65c0b8b
rerunning tests
ieviev Jul 1, 2024
de085b4
retesting DFA timeout
ieviev Jul 1, 2024
5ef3b32
more precise regex memory limit for DFA mode
ieviev Jul 2, 2024
281446f
reverting change
ieviev Jul 2, 2024
8f78046
reverting reversal refactor
ieviev Jul 3, 2024
7157520
Apply suggestions from code review
ieviev Jul 3, 2024
931552d
variable naming
ieviev Jul 3, 2024
cc493f1
test for over 255 minterms
ieviev Jul 3, 2024
a0d2390
adding net directive around test
ieviev Jul 3, 2024
0691c58
all engines in minterms test
ieviev Jul 3, 2024
8ceb207
Apply suggestions from code review
ieviev Jul 3, 2024
379519b
Apply suggestions from code review
ieviev Jul 3, 2024
57c8f6d
simplifying code
ieviev Jul 3, 2024
2e57d42
state flag values down
ieviev Jul 3, 2024
60b1352
mintermclassifier changes
ieviev Jul 3, 2024
2900aad
reversal
ieviev Jul 4, 2024
764ded8
getstateflags
ieviev Jul 4, 2024
81d0dca
formatting
ieviev Jul 4, 2024
38f28b9
removing unused interface
ieviev Jul 4, 2024
cce1188
local function typo
ieviev Jul 4, 2024
8b946da
temporarily removing minterms test
ieviev Jul 5, 2024
d3430b3
re-adding minterms test
ieviev Jul 6, 2024
388c256
reenabling test for all engines
ieviev Jul 8, 2024
2704641
test bugfix
ieviev Jul 8, 2024
0abaabe
expected matches change
ieviev Jul 8, 2024
0a0f409
Review and clean up some code
stephentoub Jul 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Apply suggestions from code review
Co-authored-by: Stephen Toub <[email protected]>
  • Loading branch information
ieviev and stephentoub committed Jul 10, 2024
commit 8ceb20767ee48052122db04dfce3c1ca1e645476
Original file line number Diff line number Diff line change
Expand Up @@ -510,21 +510,22 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
{
i -= _optimizedReversalState.FixedLength;
reversalStartState = new CurrentState(_optimizedReversalState.AdjustedStartState!);

// reversal may already be nullable here in the case of anchors
if (_containsAnyAnchor && _nullabilityArray[reversalStartState.DfaStateId] > 0)
{
if (FullNullabilityHandler.IsNullableAt<DfaStateHandler>(this,
in reversalStartState, FullInputReader.GetPositionId(this, input, i),
DfaStateHandler.GetStateFlags(this, in reversalStartState)))
in reversalStartState, FullInputReader.GetPositionId(this, input, i),
DfaStateHandler.GetStateFlags(this, in reversalStartState)))
{
initialLastStart = i;
}
}
}
else
{
reversalStartState = new CurrentState(_reverseInitialStates[
GetCharKind<FullInputReader>(input, matchEnd)]);
reversalStartState = new CurrentState(_reverseInitialStates[GetCharKind<FullInputReader>(input, matchEnd)]);

}
matchStart = matchEnd < startat
? startat
Expand Down Expand Up @@ -567,9 +568,7 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
}

/// <summary>
/// This version of <see cref="FindEndPositionFallback"/> uses a different set of interfaces,
/// which don't check for many inner loop edge cases e.g. input end or '\n'.
/// All edge cases are handled before entering the loop.
/// Streamlined version of <see cref="FindEndPositionFallback"/> that doesn't handle /z anchors or very large sets of minterms.
/// </summary>
private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHandler, TOptimizedNullabilityHandler>(
ReadOnlySpan<char> input, int pos, long timeoutOccursAt, RegexRunnerMode mode, PerThreadData perThreadData)
Expand All @@ -590,7 +589,7 @@ private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHan
bool done;
if (currentState.NfaState is null)
{
const int DfaCharsPerTimeoutCheck = 100000;
const int DfaCharsPerTimeoutCheck = 100_000;
innerLoopLength = _checkTimeout && lengthMinus1 - pos > DfaCharsPerTimeoutCheck
? pos + DfaCharsPerTimeoutCheck
: lengthMinus1;
Expand All @@ -603,7 +602,6 @@ private int FindEndPositionOptimized<TOptimizedInputReader, TAcceleratedStateHan
else
{
// nfa fallback check, assume \Z and full nullability for nfa since it's already extremely rare to get here
// worst case NFA speed is about 150 kb/s, this means the check is about every 13ms
const int NfaCharsPerTimeoutCheck = 1000;
innerLoopLength = _checkTimeout && input.Length - pos > NfaCharsPerTimeoutCheck
? pos + NfaCharsPerTimeoutCheck
Expand Down Expand Up @@ -772,10 +770,8 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
}

if (TAcceleratedStateHandler.TryFindNextStartingPosition<TOptimizedInputReader>(
this, mtlookup, input, ref currStateId, ref pos, initialStateId))
this, mtlookup, input, ref currStateId, ref pos, initialStateId))
{
// a good potential future optimization here would
// be to combine this with an immediate state transition
if (pos == input.Length)
{
// patterns such as ^$ can be nullable right away
Expand All @@ -784,16 +780,18 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
// the end position kind was nullable
endPos = pos;
}

currStateId = deadStateId;
continue;
}
}

// If the state is nullable for the next character, we found a potential end state.
if (TOptimizedNullabilityHandler.IsNullable<TOptimizedInputReader>(this, _nullabilityArray, currStateId, mtlookup,
maxChar, input, pos))
if (TOptimizedNullabilityHandler.IsNullable<TOptimizedInputReader>(
this, _nullabilityArray, currStateId, mtlookup, maxChar, input, pos))
{
endPos = pos;

// A match is known to exist. If that's all we need to know, we're done.
if (mode == RegexRunnerMode.ExistenceRequired)
{
Expand All @@ -804,27 +802,28 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
// If there is more input available try to transition with the next character.
// Note: the order here is important so the transition itself gets taken
if (!DfaStateHandler.TryTakeDFATransition(
this, ref currStateId, TOptimizedInputReader.GetPositionId(mtlookup, maxChar, input, pos),
timeoutOccursAt)
|| pos >= lengthMinus1)
this, ref currStateId, TOptimizedInputReader.GetPositionId(mtlookup, maxChar, input, pos), timeoutOccursAt) ||
pos >= lengthMinus1)
{
if (pos + 1 < input.Length)
{
return false;
}
pos++;

// one off check for the final position
// this is just to move it out of the hot loop
if (!(_stateFlagsArray[currStateId].IsNullable() ||
_stateArray[currStateId]!.IsNullableFor(
GetPositionKind(-1))))
_stateArray[currStateId]!.IsNullableFor(GetPositionKind(-1))))
{
return true;

}
// the end position (-1) was nullable
endPos = pos;
return true;
}

// We successfully transitioned, so update our current input index to match.
pos++;
}
Expand Down Expand Up @@ -856,8 +855,7 @@ private int FindEndPositionFallback<TInputReader, TFindOptimizationsHandler, TNu
/// <see langword="false"/> if iteration completed because we ran out of input or we failed to transition.
/// </returns>
private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimizationsHandler, TNullabilityHandler>(ReadOnlySpan<char> input, int length, RegexRunnerMode mode,
long timeoutOccursAt, ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef
)
long timeoutOccursAt, ref int posRef, ref CurrentState state, ref int endPosRef, ref int initialStatePosRef, ref int initialStatePosCandidateRef)
where TStateHandler : struct, IStateHandler
where TInputReader : struct, IInputReader
where TFindOptimizationsHandler : struct, IInitialStateHandler
Expand All @@ -879,6 +877,7 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
{
return true;
}

if (state.DfaStateId == initialStateId)
{
if (!TFindOptimizationsHandler.TryFindNextStartingPosition<TInputReader>(this, input, ref state, ref pos))
Expand All @@ -893,9 +892,10 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
// If the state is nullable for the next character, meaning it accepts the empty string,
// we found a potential end state.
if (TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state,
positionId, TStateHandler.GetStateFlags(this, in state)))
positionId, TStateHandler.GetStateFlags(this, in state)))
{
endPos = pos;

// endStateId = TStateHandler.ExtractNullableCoreStateId(this, in state, input, pos);
initialStatePos = initialStatePosCandidate;

Expand All @@ -907,8 +907,7 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
}

// If there is more input available try to transition with the next character.
if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state,
positionId, timeoutOccursAt))
if (pos >= length || !TStateHandler.TryTakeTransition(this, ref state, positionId, timeoutOccursAt))
{
return false;
}
Expand All @@ -922,7 +921,6 @@ private bool FindEndPositionDeltasDFA<TStateHandler, TInputReader, TFindOptimiza
// Write back the local copies of the ref values.
posRef = pos;
endPosRef = endPos;
// endStateIdRef = endStateId;
initialStatePosRef = initialStatePos;
initialStatePosCandidateRef = initialStatePosCandidate;
}
Expand Down Expand Up @@ -1078,10 +1076,11 @@ private bool FindStartPositionDeltasDFA<TStateHandler, TInputReader, TNullabilit
while (true)
{
int positionId = TInputReader.GetPositionId(this, input, pos - 1);

// If the state accepts the empty string, we found a valid starting position. Record it and keep going,
// since we're looking for the earliest one to occur within bounds.
if (_nullabilityArray[state.DfaStateId] > 0 && TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId,
TStateHandler.GetStateFlags(this, in state)))
if (_nullabilityArray[state.DfaStateId] > 0 &&
TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, TStateHandler.GetStateFlags(this, in state)))
{
lastStart = pos;
}
Expand Down Expand Up @@ -1128,8 +1127,7 @@ private bool FindStartPositionDeltasNFA<TStateHandler, TInputReader, TNullabilit

// If the state accepts the empty string, we found a valid starting position. Record it and keep going,
// since we're looking for the earliest one to occur within bounds.
if (TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId,
TStateHandler.GetStateFlags(this, in state)))
if (TNullabilityHandler.IsNullableAt<TStateHandler>(this, in state, positionId, TStateHandler.GetStateFlags(this, in state)))
{
lastStart = pos;
}
Expand Down Expand Up @@ -1473,7 +1471,8 @@ public static bool TryTakeTransition(SymbolicRegexMatcher<TSet> matcher, ref Cur
public static bool TryTakeDFATransition(SymbolicRegexMatcher<TSet> matcher, ref int state,
int mintermId, long timeoutOccursAt)
{
Debug.Assert(state > 0, $"Expected non-zero {nameof(state)}.");
Debug.Assert(state > 0, $"Expected {nameof(state)} {state} > 0");

// Use the mintermId for the character being read to look up which state to transition to.
// If that state has already been materialized, move to it, and we're done. If that state
// hasn't been materialized, try to create it; if we can, move to it, and we're done.
Expand All @@ -1487,14 +1486,15 @@ public static bool TryTakeDFATransition(SymbolicRegexMatcher<TSet> matcher, ref
}

if (matcher.TryCreateNewTransition(matcher.GetState(state), mintermId,
matcher.DeltaOffset(state, mintermId),
checkThreshold: true, out MatchingState<TSet>? nextState, timeoutOccursAt))
matcher.DeltaOffset(state, mintermId),
checkThreshold: true, out MatchingState<TSet>? nextState, timeoutOccursAt))
{
// We were able to create a new DFA transition to some state. Move to it and
// return that we're still operating as a DFA and can keep going.
state = nextState.Id;
return true;
}

return false;
}

Expand Down Expand Up @@ -1703,8 +1703,7 @@ public static void UndoTransition(ref CurrentState state)
/// </summary>
private interface IOptimizedInputReader
{
public static abstract int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan<char> input,
int pos);
public static abstract int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan<char> input, int pos);
}

/// <summary>
Expand All @@ -1717,7 +1716,8 @@ public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan<char> i
{
Debug.Assert(pos < input.Length, "pos < input.Length");
Debug.Assert(maxChar <= (lookup.Length + 1), $"maxChar = {maxChar}; lookup.Length = {lookup.Length}");
return input[pos] > maxChar ? 0 : lookup[input[pos]];
char c = input[pos];
return (uint)c < (uint)lookup.Length ? lookup[c] : 0;
}
}

Expand All @@ -1742,8 +1742,7 @@ public static int GetPositionId(byte[] lookup, int maxChar, ReadOnlySpan<char> i
private interface IOptimizedNullabilityHandler
{
public static abstract bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet> matcher,
byte[] nullabilityArray, int
currStateId, byte[] lookup, int maxChar, ReadOnlySpan<char> input, int pos)
byte[] nullabilityArray, int currStateId, byte[] lookup, int maxChar, ReadOnlySpan<char> input, int pos)
where TOptimizedInputReader : struct, IOptimizedInputReader;
}

Expand All @@ -1755,8 +1754,7 @@ public static bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet>
where TOptimizedInputReader : struct, IOptimizedInputReader
{
Debug.Assert(pos < input.Length, "input end should not be handled here");
Debug.Assert(currStateId < nullabilityArray.Length,
"nullabilityArray grown but the reference is not up to date");
Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date");
return nullabilityArray[currStateId] > 0;
}
}
Expand All @@ -1770,8 +1768,9 @@ public static bool IsNullable<TOptimizedInputReader>(SymbolicRegexMatcher<TSet>
{
Debug.Assert(pos < input.Length, "input end should not be handled here");
Debug.Assert(currStateId < nullabilityArray.Length, "nullabilityArray grown but the reference is not up to date");
return nullabilityArray[currStateId] > 0 && matcher.IsNullableWithContext(currStateId, TOptimizedInputReader.GetPositionId(lookup,
maxChar, input, pos));
return
nullabilityArray[currStateId] > 0 &&
matcher.IsNullableWithContext(currStateId, TOptimizedInputReader.GetPositionId(lookup, maxChar, input, pos));
}
}

Expand Down Expand Up @@ -1827,22 +1826,22 @@ public static abstract bool TryFindNextStartingPosition<TInputReader>(SymbolicRe
private interface IAcceleratedStateHandler
{
public static abstract bool TryFindNextStartingPosition<TOptimizedInputReader>(
SymbolicRegexMatcher<TSet> matcher, byte[] lookup, ReadOnlySpan<char> input, ref
int currentStateId, ref int pos, int initialStateId)
SymbolicRegexMatcher<TSet> matcher, byte[] lookup, ReadOnlySpan<char> input,
ref int currentStateId, ref int pos, int initialStateId)
where TOptimizedInputReader : struct, IOptimizedInputReader;
}

private readonly struct NoAnchorAcceleratedStateHandler : IAcceleratedStateHandler
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool TryFindNextStartingPosition<TOptimizedInputReader>(SymbolicRegexMatcher<TSet> matcher,
byte[] lookup,
ReadOnlySpan<char> input, ref int currentStateId, ref int pos, int initialStateId)
byte[] lookup, ReadOnlySpan<char> input, ref int currentStateId, ref int pos, int initialStateId)
where TOptimizedInputReader : struct, IOptimizedInputReader

{
if (currentStateId != initialStateId)
{
return false;
}

if (matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
{
Expand Down