From e151e932f86e2ba3b0f97e0877170a219e94d1e2 Mon Sep 17 00:00:00 2001 From: Alois Kraus Date: Thu, 18 Oct 2018 22:53:06 +0200 Subject: [PATCH 1/5] Fix for Regex performance issue when compiled Regex is used with RegexOptions.IgnoreCase or RegexOptions.CultureInvariant. This saves over 40% in these cases. --- .../tests/RegexCompilationInfoTests.cs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/System.Text.RegularExpressions/tests/RegexCompilationInfoTests.cs b/src/System.Text.RegularExpressions/tests/RegexCompilationInfoTests.cs index ab7d512b9acc..2c7eb0b44707 100644 --- a/src/System.Text.RegularExpressions/tests/RegexCompilationInfoTests.cs +++ b/src/System.Text.RegularExpressions/tests/RegexCompilationInfoTests.cs @@ -3,6 +3,7 @@ // See the LICENSE file in the project root for more information. using System.Collections.Generic; +using System.Diagnostics; using Xunit; namespace System.Text.RegularExpressions.Tests @@ -65,6 +66,20 @@ public void MatchTimeout_GetSet_Throws(TimeSpan matchTimeout) regexCompilationInfo.MatchTimeout = matchTimeout); } + [Fact] + public void Compile_To_Assembly() + { + var rex = new Regex(".*(/Common.Controls.Wpf;component/Themes/)[^/]*.xaml", RegexOptions.Compiled | RegexOptions.IgnoreCase); + const string noMatch = "This is a long string which contains no matches at all. 
Sorry for that"; + var sw = Stopwatch.StartNew(); + for(int i=0;i<100*1000;i++) + { + Assert.False(rex.IsMatch(noMatch)); + } + sw.Stop(); + Console.WriteLine($"Did take: {sw.Elapsed.TotalSeconds}s"); + } + [Theory] [MemberData(nameof(MatchTimeout_GetSet_Success_MemberData))] public void MatchTimeout_GetSet_Success(TimeSpan matchTimeout) From 62d7b7c78973ecdfc8fcc20929b7954c160cca5b Mon Sep 17 00:00:00 2001 From: Alois Kraus Date: Thu, 18 Oct 2018 23:09:29 +0200 Subject: [PATCH 2/5] =?UTF-8?q?Fix=20for=20Regex=20performance=20issue=20w?= =?UTF-8?q?hen=20compiled=20Regex=20is=20used=20with=20Rege=E2=80=A6xOptio?= =?UTF-8?q?ns.IgnoreCase=20or=20RegexOptions.CultureInvariant.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This saves over 40% in these cases. --- .../Text/RegularExpressions/RegexCompiler.cs | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 5f585ba7d343..3da864922acd 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -72,6 +72,7 @@ internal abstract class RegexCompiler private LocalBuilder _tempV; private LocalBuilder _temp2V; private LocalBuilder _temp3V; + private LocalBuilder _cultureV; // current culture is cached in local variable to prevent many thread local storage accesses for CultureInfo.CurrentCulture protected RegexCode _code; // the RegexCode object (used for debugging only) protected int[] _codes; // the RegexCodes being translated @@ -961,14 +962,20 @@ private void Advance() _ilg.Emit(OpCodes.Br, AdvanceLabel()); } - private void CallToLower() + private void InitLocalCultureInfo() { if ((_options & RegexOptions.CultureInvariant) != 0) 
Call(s_getInvariantCulture); else Call(s_getCurrentCulture); - Call(s_chartolowerM); + Stloc(_cultureV); + } + + private void CallToLower() + { + Ldloc(_cultureV); + Call(s_chartolowerM); } /* @@ -1089,6 +1096,9 @@ protected void GenerateFindFirstChar() _textV = DeclareString(); _tempV = DeclareInt(); _temp2V = DeclareInt(); + _cultureV = DeclareCultureInfo(); + + InitLocalCultureInfo(); if (0 != (_anchors & (RegexFCD.Beginning | RegexFCD.Start | RegexFCD.EndZ | RegexFCD.End))) { @@ -1552,6 +1562,14 @@ private LocalBuilder DeclareInt() return _ilg.DeclareLocal(typeof(int)); } + /* + * Declares a local CultureInfo + */ + private LocalBuilder DeclareCultureInfo() + { + return _ilg.DeclareLocal(typeof(CultureInfo)); + } + /* * Declares a local int array */ @@ -1587,6 +1605,7 @@ protected void GenerateGo() _textbegV = DeclareInt(); _textendV = DeclareInt(); _textstartV = DeclareInt(); + _cultureV = DeclareCultureInfo(); // clear some tables @@ -1600,6 +1619,9 @@ protected void GenerateGo() // emit the code! + // cache CultureInfo in local variable which saves excessive thread local storage accesses + InitLocalCultureInfo(); + GenerateForwardSection(); GenerateMiddleSection(); GenerateBacktrackSection(); From 07ce7fc848c680b0b774363f2c5c9b59cb7790db Mon Sep 17 00:00:00 2001 From: Alois Kraus Date: Thu, 18 Oct 2018 23:10:18 +0200 Subject: [PATCH 3/5] Revert "Fix for Regex performance issue when compiled Regex is used with RegexOptions.IgnoreCase or RegexOptions.CultureInvariant." This reverts commit e151e932f86e2ba3b0f97e0877170a219e94d1e2. 
--- .../tests/RegexCompilationInfoTests.cs | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/System.Text.RegularExpressions/tests/RegexCompilationInfoTests.cs b/src/System.Text.RegularExpressions/tests/RegexCompilationInfoTests.cs index 2c7eb0b44707..ab7d512b9acc 100644 --- a/src/System.Text.RegularExpressions/tests/RegexCompilationInfoTests.cs +++ b/src/System.Text.RegularExpressions/tests/RegexCompilationInfoTests.cs @@ -3,7 +3,6 @@ // See the LICENSE file in the project root for more information. using System.Collections.Generic; -using System.Diagnostics; using Xunit; namespace System.Text.RegularExpressions.Tests @@ -66,20 +65,6 @@ public void MatchTimeout_GetSet_Throws(TimeSpan matchTimeout) regexCompilationInfo.MatchTimeout = matchTimeout); } - [Fact] - public void Compile_To_Assembly() - { - var rex = new Regex(".*(/Common.Controls.Wpf;component/Themes/)[^/]*.xaml", RegexOptions.Compiled | RegexOptions.IgnoreCase); - const string noMatch = "This is a long string which contains no matches at all. Sorry for that"; - var sw = Stopwatch.StartNew(); - for(int i=0;i<100*1000;i++) - { - Assert.False(rex.IsMatch(noMatch)); - } - sw.Stop(); - Console.WriteLine($"Did take: {sw.Elapsed.TotalSeconds}s"); - } - [Theory] [MemberData(nameof(MatchTimeout_GetSet_Success_MemberData))] public void MatchTimeout_GetSet_Success(TimeSpan matchTimeout) From a458fc05ff3bc9dedeb298314f41592aa4238994 Mon Sep 17 00:00:00 2001 From: Alois Kraus Date: Mon, 22 Oct 2018 23:52:15 +0200 Subject: [PATCH 4/5] Added TurkishI tests which check compiled and interpreted regular expressions. 
--- .../tests/RegexCultureTests.cs | 65 +++++++++++++++++++ ...ystem.Text.RegularExpressions.Tests.csproj | 1 + 2 files changed, 66 insertions(+) create mode 100644 src/System.Text.RegularExpressions/tests/RegexCultureTests.cs diff --git a/src/System.Text.RegularExpressions/tests/RegexCultureTests.cs b/src/System.Text.RegularExpressions/tests/RegexCultureTests.cs new file mode 100644 index 000000000000..37bda571a9ea --- /dev/null +++ b/src/System.Text.RegularExpressions/tests/RegexCultureTests.cs @@ -0,0 +1,65 @@ +using System; +using System.Collections.Generic; +using System.Globalization; +using System.Text; +using Xunit; + +namespace System.Text.RegularExpressions.Tests +{ + public class RegexCultureTests + { + /// + /// See https://en.wikipedia.org/wiki/Dotted_and_dotless_I + /// + [Fact] + public void TurkishI_Is_Differently_LowerUpperCased_In_Turkish_Culture() + { + var turkish = new CultureInfo("tr-TR"); + string input = "Iıİi"; + + var cultInvariantRegex = Create(input, CultureInfo.InvariantCulture, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant); + var turkishRegex = Create(input, turkish, RegexOptions.IgnoreCase); + + // same input and regex does match so far so good + Assert.All(cultInvariantRegex, rex => Assert.Equal(true, rex.IsMatch(input)) ); + + // When the Regex was created with a Turkish locale, the lower-cased Turkish version will + // no longer match the input string, which contains upper- and lower-case i variants; hence even the input string + // will no longer match. + Assert.All(turkishRegex, rex => Assert.Equal(false, rex.IsMatch(input))); + + // Now comes the tricky part: depending on the locale used in ToLower, the results differ. + // Hence the regular expression will not match if different locales were used. + Assert.All(cultInvariantRegex, rex => Assert.Equal(true, rex.IsMatch(input.ToLowerInvariant()))); + Assert.All(cultInvariantRegex, rex => Assert.Equal(false, rex.IsMatch(input.ToLower(turkish)))); + + Assert.All(turkishRegex, rex => 
Assert.Equal(false, rex.IsMatch(input.ToLowerInvariant()))); + Assert.All(turkishRegex, rex => Assert.Equal(true, rex.IsMatch(input.ToLower(turkish)))); + } + + /// + /// Creates the regular expression twice, once interpreted and once compiled, to check that both behave the same + /// + /// Input regex string + /// Thread culture to use when creating the regex + /// Additional regex options + /// + Regex[] Create(string input, CultureInfo info, RegexOptions additional) + { + CultureInfo current = CultureInfo.CurrentCulture; + try + { + CultureInfo.CurrentCulture = info; + + // When RegexOptions.IgnoreCase is supplied, the current thread culture is used to lowercase the input string. + // Unless RegexOptions.CultureInvariant is also specified, locale-dependent effects on the generated code or state machine may occur. + var localizedRegex = new Regex[] { new Regex(input, additional), new Regex(input, RegexOptions.Compiled | additional) }; + return localizedRegex; + } + finally + { + CultureInfo.CurrentCulture = current; + } + } + } +} diff --git a/src/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj b/src/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj index f51615f1c434..9ff04485d62d 100644 --- a/src/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj +++ b/src/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj @@ -25,6 +25,7 @@ + System\Text\RegularExpressions\RegexParseError.cs From 2b18982a1d377d79a97c8ad185eabf73ef547529 Mon Sep 17 00:00:00 2001 From: Alois-xx Date: Thu, 25 Oct 2018 13:52:30 +0200 Subject: [PATCH 5/5] Removed var of test --- src/System.Text.RegularExpressions/tests/RegexCultureTests.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/System.Text.RegularExpressions/tests/RegexCultureTests.cs b/src/System.Text.RegularExpressions/tests/RegexCultureTests.cs index 37bda571a9ea..9097c2f02e7f 100644 --- 
a/src/System.Text.RegularExpressions/tests/RegexCultureTests.cs +++ b/src/System.Text.RegularExpressions/tests/RegexCultureTests.cs @@ -17,8 +17,8 @@ public void TurkishI_Is_Differently_LowerUpperCased_In_Turkish_Culture() var turkish = new CultureInfo("tr-TR"); string input = "Iıİi"; - var cultInvariantRegex = Create(input, CultureInfo.InvariantCulture, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant); - var turkishRegex = Create(input, turkish, RegexOptions.IgnoreCase); + Regex[] cultInvariantRegex = Create(input, CultureInfo.InvariantCulture, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant); + Regex[] turkishRegex = Create(input, turkish, RegexOptions.IgnoreCase); // same input and regex does match so far so good Assert.All(cultInvariantRegex, rex => Assert.Equal(true, rex.IsMatch(input)) );