Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

<regex>: Equivalence classes have unexpected behavior with std::wregex #5435

@StephanTLavavej

Description

@StephanTLavavej

Repros with VS 2022 17.14 Preview 4 with microsoft/STL main, including #5392. Tracked by internal VSO-127463 / AB#127463, originally reported by an external user through the defunct Microsoft Connect site on 2015-06-02.

C:\Temp>type meow.cpp
#include <locale>
#include <print>
#include <regex>
#include <string>
using namespace std;

/// Renders a wide string as the text of a C++ wide string literal built from hex escapes.
///
/// The result starts with `L"`, holds one `\x<hex>` escape (lowercase, unpadded) per element
/// of `wstr` in order, and ends with `"` -- for example, `L"E"` becomes `L"\x45"`.
///
/// @param wstr Wide string whose elements are escaped.
/// @return The escaped representation as a narrow string; `L""` for an empty input.
[[nodiscard]] std::string escape_wide(const std::wstring& wstr) {
    std::string ret{R"(L")"};
    for (const wchar_t wch : wstr) {
        // Format the element's numeric value. The cast targets `unsigned int` rather than
        // `unsigned short` so that no high bits are dropped where wchar_t is wider than 16 bits;
        // for a 16-bit wchar_t the printed digits are unchanged.
        ret += std::format(R"(\x{:x})", static_cast<unsigned int>(wch));
    }
    ret += R"(")";
    return ret;
}

/// Matches `wstr` against `pattern` under the "fr-FR" locale and prints a one-line report.
///
/// The pattern is compiled into a `std::wregex` with the `icase` and `collate` flags after the
/// locale has been imbued. The printed line contains the escaped input, the `regex_match`
/// result, and the escaped primary sort key that `std::regex_traits<wchar_t>::transform_primary`
/// returns for the input under the same locale.
///
/// @param wstr    Subject string handed to `regex_match` (whole-string match).
/// @param pattern Regular expression source text.
void display_result(const std::wstring& wstr, const std::wstring& pattern) {
    const std::locale french{"fr-FR"};
    const auto syntax = std::regex_constants::icase | std::regex_constants::collate;

    // Imbue first, then assign: the pattern is compiled with whatever locale is current.
    std::wregex compiled;
    compiled.imbue(french);
    compiled.assign(pattern, syntax);
    const bool matched = std::regex_match(wstr, compiled);

    // A separate traits object with the same locale exposes the key used for equivalence classes.
    std::regex_traits<wchar_t> traits;
    traits.imbue(french);
    const std::wstring sort_key = traits.transform_primary(wstr.begin(), wstr.end());

    const std::string shown_input = escape_wide(wstr);
    const std::string shown_key   = escape_wide(sort_key);
    std::println("wstr: {}; result: {:>5}; primary_sort_key: {}", shown_input, matched, shown_key);
}

/// Runs the repro: tests the equivalence class `[[=e=]]` against plain and accented forms of
/// the letter E, upper case first and then lower case, printing one report line per subject.
int main() {
    const std::wstring pattern{L"[[=e=]]"};

    const std::wstring subjects[] = {
        L"E",
        L"\u00C8", // LATIN CAPITAL LETTER E WITH GRAVE
        L"\u00C9", // LATIN CAPITAL LETTER E WITH ACUTE
        L"\u00CA", // LATIN CAPITAL LETTER E WITH CIRCUMFLEX
        L"e",
        L"\u00E8", // LATIN SMALL LETTER E WITH GRAVE
        L"\u00E9", // LATIN SMALL LETTER E WITH ACUTE
        L"\u00EA", // LATIN SMALL LETTER E WITH CIRCUMFLEX
    };

    for (const std::wstring& subject : subjects) {
        display_result(subject, pattern);
    }
}
C:\Temp>cl /EHsc /nologo /W4 /std:c++latest /MTd /Od meow.cpp && meow
meow.cpp
wstr: L"\x45"; result:  true; primary_sort_key: L"\xe\x21\x1\x1\x1\x1\x0"
wstr: L"\xc8"; result: false; primary_sort_key: L"\xe\x21\x1\xf\x1\x1\x1\x0"
wstr: L"\xc9"; result: false; primary_sort_key: L"\xe\x21\x1\xe\x1\x1\x1\x0"
wstr: L"\xca"; result: false; primary_sort_key: L"\xe\x21\x1\x12\x1\x1\x1\x0"
wstr: L"\x65"; result:  true; primary_sort_key: L"\xe\x21\x1\x1\x1\x1\x0"
wstr: L"\xe8"; result: false; primary_sort_key: L"\xe\x21\x1\xf\x1\x1\x1\x0"
wstr: L"\xe9"; result: false; primary_sort_key: L"\xe\x21\x1\xe\x1\x1\x1\x0"
wstr: L"\xea"; result: false; primary_sort_key: L"\xe\x21\x1\x12\x1\x1\x1\x0"

The user expects regex_match to always return true here.

I don't understand why LCMapStringEx with LCMAP_SORTKEY is producing these primary sort keys. Are we supposed to be passing extra flags to ignore diacritics?

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug (Something isn't working) · fixed (Something works now, yay!) · regex (meow is a substring of homeowner)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions