diff --git a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 5f585ba7d343..3da864922acd 100644 --- a/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -72,6 +72,7 @@ internal abstract class RegexCompiler private LocalBuilder _tempV; private LocalBuilder _temp2V; private LocalBuilder _temp3V; + private LocalBuilder _cultureV; // culture used by CallToLower (invariant or current); cached in a local variable so CultureInfo.CurrentCulture's thread local storage is not accessed before every ToLower call protected RegexCode _code; // the RegexCode object (used for debugging only) protected int[] _codes; // the RegexCodes being translated @@ -961,14 +962,20 @@ private void Advance() _ilg.Emit(OpCodes.Br, AdvanceLabel()); } - private void CallToLower() + private void InitLocalCultureInfo() { if ((_options & RegexOptions.CultureInvariant) != 0) Call(s_getInvariantCulture); else Call(s_getCurrentCulture); - Call(s_chartolowerM); + Stloc(_cultureV); + } + + private void CallToLower() + { + Ldloc(_cultureV); + Call(s_chartolowerM); } /* @@ -1089,6 +1096,9 @@ protected void GenerateFindFirstChar() _textV = DeclareString(); _tempV = DeclareInt(); _temp2V = DeclareInt(); + _cultureV = DeclareCultureInfo(); + + InitLocalCultureInfo(); if (0 != (_anchors & (RegexFCD.Beginning | RegexFCD.Start | RegexFCD.EndZ | RegexFCD.End))) { @@ -1552,6 +1562,14 @@ private LocalBuilder DeclareInt() return _ilg.DeclareLocal(typeof(int)); } + /* + * Declares a local CultureInfo + */ + private LocalBuilder DeclareCultureInfo() + { + return _ilg.DeclareLocal(typeof(CultureInfo)); + } + /* * Declares a local int array */ @@ -1587,6 +1605,7 @@ protected void GenerateGo() _textbegV = DeclareInt(); _textendV = DeclareInt(); _textstartV = DeclareInt(); + _cultureV = DeclareCultureInfo(); // 
clear some tables @@ -1600,6 +1619,9 @@ protected void GenerateGo() // emit the code! + // cache the CultureInfo in a local variable to avoid repeated thread local storage accesses; NOTE(review): emitted unconditionally here, even if no ToLower call is generated - confirm this is acceptable + InitLocalCultureInfo(); + GenerateForwardSection(); GenerateMiddleSection(); GenerateBacktrackSection(); diff --git a/src/System.Text.RegularExpressions/tests/RegexCultureTests.cs b/src/System.Text.RegularExpressions/tests/RegexCultureTests.cs new file mode 100644 index 000000000000..9097c2f02e7f --- /dev/null +++ b/src/System.Text.RegularExpressions/tests/RegexCultureTests.cs @@ -0,0 +1,65 @@ +using System; +using System.Collections.Generic; +using System.Globalization; +using System.Text; +using Xunit; + +namespace System.Text.RegularExpressions.Tests +{ + public class RegexCultureTests + { + /// + /// Checks case-insensitive matching of the dotted and dotless I for interpreted and compiled regexes. See https://en.wikipedia.org/wiki/Dotted_and_dotless_I + /// + [Fact] + public void TurkishI_Is_Differently_LowerUpperCased_In_Turkish_Culture() + { + var turkish = new CultureInfo("tr-TR"); + string input = "Iıİi"; + + Regex[] cultInvariantRegex = Create(input, CultureInfo.InvariantCulture, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant); + Regex[] turkishRegex = Create(input, turkish, RegexOptions.IgnoreCase); + + // the culture invariant regex matches its own pattern text, so far so good + Assert.All(cultInvariantRegex, rex => Assert.Equal(true, rex.IsMatch(input)) ); + + // when the Regex was created with a Turkish locale, the lower-cased Turkish version will + // no longer match the input string, which contains upper- and lower-case dotted and dotless i's; hence even the + // pattern's own text will no longer match + Assert.All(turkishRegex, rex => Assert.Equal(false, rex.IsMatch(input))); + + // Now comes the tricky part: depending on the locale used in ToLower the results differ. + // Hence the regular expression will not match if different locales were used + Assert.All(cultInvariantRegex, rex => Assert.Equal(true, rex.IsMatch(input.ToLowerInvariant()))); + Assert.All(cultInvariantRegex, rex => Assert.Equal(false, 
rex.IsMatch(input.ToLower(turkish)))); + + Assert.All(turkishRegex, rex => Assert.Equal(false, rex.IsMatch(input.ToLowerInvariant()))); + Assert.All(turkishRegex, rex => Assert.Equal(true, rex.IsMatch(input.ToLower(turkish)))); + } + + /// + /// Creates the regular expression twice, first interpreted and then compiled, to check that both behave the same + /// + /// Regex pattern string + /// Thread culture that is current while the regexes are created; the previous culture is restored afterwards + /// Additional regex options applied to both regexes + /// + Regex[] Create(string input, CultureInfo info, RegexOptions additional) + { + CultureInfo current = CultureInfo.CurrentCulture; + try + { + CultureInfo.CurrentCulture = info; + + // When RegexOptions.IgnoreCase is supplied the current thread culture is used to lowercase the input (pattern) string. + // Unless RegexOptions.CultureInvariant is also specified, locale-dependent effects on the generated code or state machine may happen. + var localizedRegex = new Regex[] { new Regex(input, additional), new Regex(input, RegexOptions.Compiled | additional) }; + return localizedRegex; + } + finally + { + CultureInfo.CurrentCulture = current; + } + } + } +} diff --git a/src/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj b/src/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj index f51615f1c434..9ff04485d62d 100644 --- a/src/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj +++ b/src/System.Text.RegularExpressions/tests/System.Text.RegularExpressions.Tests.csproj @@ -25,6 +25,7 @@ + System\Text\RegularExpressions\RegexParseError.cs