Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit be97a13

Browse files
committed
ICU-22942 MF2 ICU4C: NFC-normalize names and keys according to spec
1 parent 8bdb306 commit be97a13

11 files changed

+146
-20
lines changed

icu4c/source/i18n/messageformat2.cpp

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,10 @@
1111
#include "unicode/messageformat2_data_model.h"
1212
#include "unicode/messageformat2_formattable.h"
1313
#include "unicode/messageformat2.h"
14+
#include "unicode/normalizer2.h"
1415
#include "unicode/unistr.h"
1516
#include "messageformat2_allocation.h"
17+
#include "messageformat2_checker.h"
1618
#include "messageformat2_evaluation.h"
1719
#include "messageformat2_macros.h"
1820

@@ -37,7 +39,7 @@ static Formattable evalLiteral(const Literal& lit) {
3739
// The fallback for a variable name is itself.
3840
UnicodeString str(DOLLAR);
3941
str += var;
40-
const Formattable* val = context.getGlobal(var, errorCode);
42+
const Formattable* val = context.getGlobal(*this, var, errorCode);
4143
if (U_SUCCESS(errorCode)) {
4244
return (FormattedPlaceholder(*val, str));
4345
}
@@ -52,9 +54,9 @@ static Formattable evalLiteral(const Literal& lit) {
5254
}
5355

5456
[[nodiscard]] FormattedPlaceholder MessageFormatter::formatOperand(const Environment& env,
55-
const Operand& rand,
56-
MessageContext& context,
57-
UErrorCode &status) const {
57+
const Operand& rand,
58+
MessageContext& context,
59+
UErrorCode &status) const {
5860
if (U_FAILURE(status)) {
5961
return {};
6062
}
@@ -71,15 +73,19 @@ static Formattable evalLiteral(const Literal& lit) {
7173
// Eager vs. lazy evaluation is an open issue:
7274
// see https://github.com/unicode-org/message-format-wg/issues/299
7375

76+
// NFC-normalize the variable name. See
77+
// https://github.com/unicode-org/message-format-wg/blob/main/spec/syntax.md#names-and-identifiers
78+
const VariableName normalized = normalizeNFC(var);
79+
7480
// Look up the variable in the environment
75-
if (env.has(var)) {
81+
if (env.has(normalized)) {
7682
// `var` is a local -- look it up
77-
const Closure& rhs = env.lookup(var);
83+
const Closure& rhs = env.lookup(normalized);
7884
// Format the expression using the environment from the closure
7985
return formatExpression(rhs.getEnv(), rhs.getExpr(), context, status);
8086
}
8187
// Variable wasn't found in locals -- check if it's global
82-
FormattedPlaceholder result = evalArgument(var, context, status);
88+
FormattedPlaceholder result = evalArgument(normalized, context, status);
8389
if (status == U_ILLEGAL_ARGUMENT_ERROR) {
8490
status = U_ZERO_ERROR;
8591
// Unbound variable -- set a resolution error
@@ -761,6 +767,7 @@ void MessageFormatter::formatSelectors(MessageContext& context, const Environmen
761767
UnicodeString MessageFormatter::formatToString(const MessageArguments& arguments, UErrorCode &status) {
762768
EMPTY_ON_ERROR(status);
763769

770+
764771
// Create a new environment that will store closures for all local variables
765772
Environment* env = Environment::create(status);
766773
// Create a new context with the given arguments and the `errors` structure
@@ -813,12 +820,14 @@ void MessageFormatter::check(MessageContext& context, const Environment& localEn
813820

814821
// Check that variable is in scope
815822
const VariableName& var = rand.asVariable();
823+
UnicodeString normalized = normalizeNFC(var);
824+
816825
// Check local scope
817-
if (localEnv.has(var)) {
826+
if (localEnv.has(normalized)) {
818827
return;
819828
}
820829
// Check global scope
821-
context.getGlobal(var, status);
830+
context.getGlobal(*this, normalized, status);
822831
if (status == U_ILLEGAL_ARGUMENT_ERROR) {
823832
status = U_ZERO_ERROR;
824833
context.getErrors().setUnresolvedVariable(var, status);
@@ -855,7 +864,10 @@ void MessageFormatter::checkDeclarations(MessageContext& context, Environment*&
855864
// memoizing the value of localEnv up to this point
856865

857866
// Add the LHS to the environment for checking the next declaration
858-
env = Environment::create(decl.getVariable(), Closure(rhs, *env), env, status);
867+
env = Environment::create(normalizeNFC(decl.getVariable()),
868+
Closure(rhs, *env),
869+
env,
870+
status);
859871
CHECK_ERROR(status);
860872
}
861873
}

icu4c/source/i18n/messageformat2_arguments.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,10 @@
77

88
#if !UCONFIG_NO_MF2
99

10+
#include "unicode/messageformat2.h"
1011
#include "unicode/messageformat2_arguments.h"
1112
#include "unicode/messageformat2_data_model_names.h"
13+
#include "messageformat2_evaluation.h"
1214
#include "uvector.h" // U_ASSERT
1315

1416
U_NAMESPACE_BEGIN
@@ -22,11 +24,15 @@ namespace message2 {
2224

2325
using Arguments = MessageArguments;
2426

25-
const Formattable* Arguments::getArgument(const VariableName& arg, UErrorCode& errorCode) const {
27+
const Formattable* Arguments::getArgument(const MessageFormatter& context,
28+
const VariableName& arg,
29+
UErrorCode& errorCode) const {
2630
if (U_SUCCESS(errorCode)) {
2731
U_ASSERT(argsLen == 0 || arguments.isValid());
2832
for (int32_t i = 0; i < argsLen; i++) {
29-
if (argumentNames[i] == arg) {
33+
UnicodeString normalized = context.normalizeNFC(argumentNames[i]);
34+
// arg already assumed to be normalized
35+
if (normalized == arg) {
3036
return &arguments[i];
3137
}
3238
}

icu4c/source/i18n/messageformat2_checker.cpp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,10 @@
77

88
#if !UCONFIG_NO_MF2
99

10+
#include "unicode/messageformat2.h"
1011
#include "messageformat2_allocation.h"
1112
#include "messageformat2_checker.h"
13+
#include "messageformat2_evaluation.h"
1214
#include "messageformat2_macros.h"
1315
#include "uvector.h" // U_ASSERT
1416

@@ -104,6 +106,13 @@ TypeEnvironment::~TypeEnvironment() {}
104106

105107
// ---------------------
106108

109+
// Maps a variant key to the string used when comparing keys for equality:
//   - the wildcard key maps to "*";
//   - a literal key maps to the NFC-normalized form of its unquoted text,
//     as computed by the MessageFormatter held in `context`.
//
// NOTE(review): a literal key whose unquoted text is "*" produces the same
// string as the wildcard, so the two cannot be told apart from the result
// alone -- confirm that callers comparing these strings are fine with that.
UnicodeString Checker::normalizeNFC(const Key& k) const {
    return k.isWildcard()
        ? UnicodeString("*")
        : context.normalizeNFC(k.asLiteral().unquoted());
}
115+
107116
static bool areDefaultKeys(const Key* keys, int32_t len) {
108117
U_ASSERT(len > 0);
109118
for (int32_t i = 0; i < len; i++) {
@@ -185,7 +194,7 @@ void Checker::checkVariants(UErrorCode& status) {
185194
// This variant was already checked,
186195
// so we know keys1.len == len
187196
for (int32_t kk = 0; kk < len; kk++) {
188-
if (!(keys[kk] == keys1[kk])) {
197+
if (!(normalizeNFC(keys[kk]) == normalizeNFC(keys1[kk]))) {
189198
allEqual = false;
190199
break;
191200
}

icu4c/source/i18n/messageformat2_checker.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,14 +56,19 @@ namespace message2 {
5656
// an explicit declaration
5757
}; // class TypeEnvironment
5858

59+
class MessageFormatter;
60+
5961
// Checks a data model for semantic errors
6062
// (Errors are defined in https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md )
6163
class Checker {
6264
public:
6365
void check(UErrorCode&);
64-
Checker(const MFDataModel& m, StaticErrors& e) : dataModel(m), errors(e) {}
66+
Checker(const MFDataModel& d, StaticErrors& e, const MessageFormatter& mf)
67+
: dataModel(d), errors(e), context(mf) {}
6568
private:
6669

70+
UnicodeString normalizeNFC(const Key&) const;
71+
6772
void requireAnnotated(const TypeEnvironment&, const Expression&, UErrorCode&);
6873
void addFreeVars(TypeEnvironment& t, const Operand&, UErrorCode&);
6974
void addFreeVars(TypeEnvironment& t, const Operator&, UErrorCode&);
@@ -78,6 +83,9 @@ namespace message2 {
7883
void check(const Pattern&);
7984
const MFDataModel& dataModel;
8085
StaticErrors& errors;
86+
87+
// Used for NFC normalization
88+
const MessageFormatter& context;
8189
}; // class Checker
8290

8391
} // namespace message2

icu4c/source/i18n/messageformat2_evaluation.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -190,13 +190,16 @@ PrioritizedVariant::~PrioritizedVariant() {}
190190
errors.checkErrors(status);
191191
}
192192

193-
const Formattable* MessageContext::getGlobal(const VariableName& v, UErrorCode& errorCode) const {
194-
return arguments.getArgument(v, errorCode);
193+
// Looks up the external (global) argument named `v`.
//
// This simply forwards to the MessageArguments object stored in `arguments`;
// `context` is passed along so that the lookup can use the formatter's
// normalizeNFC(), and `errorCode` is passed through untouched.
const Formattable* MessageContext::getGlobal(const MessageFormatter& context,
                                             const VariableName& v,
                                             UErrorCode& errorCode) const {
    const Formattable* found = arguments.getArgument(context, v, errorCode);
    return found;
}
196198

197199
MessageContext::MessageContext(const MessageArguments& args,
198200
const StaticErrors& e,
199201
UErrorCode& status) : arguments(args), errors(e, status) {}
202+
200203
MessageContext::~MessageContext() {}
201204

202205
} // namespace message2

icu4c/source/i18n/messageformat2_evaluation.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,11 +174,15 @@ namespace message2 {
174174
// The context contains all the information needed to process
175175
// an entire message: arguments, formatter cache, and error list
176176

177+
class MessageFormatter;
178+
177179
class MessageContext : public UMemory {
178180
public:
179181
MessageContext(const MessageArguments&, const StaticErrors&, UErrorCode&);
180182

181-
const Formattable* getGlobal(const VariableName&, UErrorCode&) const;
183+
const Formattable* getGlobal(const MessageFormatter&,
184+
const VariableName&,
185+
UErrorCode&) const;
182186

183187
// If any errors were set, update `status` accordingly
184188
void checkErrors(UErrorCode& status) const;
@@ -191,6 +195,7 @@ namespace message2 {
191195
const MessageArguments& arguments; // External message arguments
192196
// Errors accumulated during parsing/formatting
193197
DynamicErrors errors;
198+
194199
}; // class MessageContext
195200

196201
} // namespace message2

icu4c/source/i18n/messageformat2_formatter.cpp

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,24 @@ namespace message2 {
116116

117117
// MessageFormatter
118118

119+
// Returns the NFC-normalized version of s, returning s itself
120+
// if it's already normalized.
121+
UnicodeString MessageFormatter::normalizeNFC(const UnicodeString& s) const {
122+
UErrorCode status = U_ZERO_ERROR;
123+
// Check if string is already normalized
124+
UNormalizationCheckResult result = nfcNormalizer->quickCheck(s, status);
125+
// If so, return it
126+
if (U_SUCCESS(status) && result == UNORM_YES) {
127+
return s;
128+
}
129+
// Otherwise, normalize it
130+
UnicodeString normalized = nfcNormalizer->normalize(s, status);
131+
if (U_FAILURE(status)) {
132+
return {};
133+
}
134+
return normalized;
135+
}
136+
119137
MessageFormatter::MessageFormatter(const MessageFormatter::Builder& builder, UErrorCode &success) : locale(builder.locale), customMFFunctionRegistry(builder.customMFFunctionRegistry) {
120138
CHECK_ERROR(success);
121139

@@ -163,14 +181,16 @@ namespace message2 {
163181
errors = errorsNew.orphan();
164182
}
165183

184+
nfcNormalizer = Normalizer2::getNFCInstance(success);
185+
166186
// Note: we currently evaluate variables lazily,
167187
// without memoization. This call is still necessary
168188
// to check out-of-scope uses of local variables in
169189
// right-hand sides (unresolved variable errors can
170190
// only be checked when arguments are known)
171191

172192
// Check for resolution errors
173-
Checker(dataModel, *errors).check(success);
193+
Checker(dataModel, *errors, *this).check(success);
174194
}
175195

176196
void MessageFormatter::cleanup() noexcept {
@@ -191,6 +211,7 @@ namespace message2 {
191211
signalErrors = other.signalErrors;
192212
errors = other.errors;
193213
other.errors = nullptr;
214+
nfcNormalizer = other.nfcNormalizer;
194215
return *this;
195216
}
196217

icu4c/source/i18n/unicode/messageformat2.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "unicode/messageformat2_arguments.h"
2121
#include "unicode/messageformat2_data_model.h"
2222
#include "unicode/messageformat2_function_registry.h"
23+
#include "unicode/normalizer2.h"
2324
#include "unicode/unistr.h"
2425

2526
#ifndef U_HIDE_DEPRECATED_API
@@ -325,6 +326,8 @@ namespace message2 {
325326

326327
private:
327328
friend class Builder;
329+
friend class Checker;
330+
friend class MessageArguments;
328331
friend class MessageContext;
329332

330333
MessageFormatter(const MessageFormatter::Builder& builder, UErrorCode &status);
@@ -352,6 +355,9 @@ namespace message2 {
352355
void resolvePreferences(MessageContext&, UVector&, UVector&, UErrorCode&) const;
353356

354357
// Formatting methods
358+
359+
// Used for normalizing variable names and keys for comparison
360+
UnicodeString normalizeNFC(const UnicodeString&) const;
355361
[[nodiscard]] FormattedPlaceholder formatLiteral(const data_model::Literal&) const;
356362
void formatPattern(MessageContext&, const Environment&, const data_model::Pattern&, UErrorCode&, UnicodeString&) const;
357363
// Formats a call to a formatting function
@@ -445,6 +451,10 @@ namespace message2 {
445451
// formatting methods return best-effort output.
446452
// The default is false.
447453
bool signalErrors = false;
454+
455+
// Used for implementing normalizeNFC()
456+
const Normalizer2* nfcNormalizer = nullptr;
457+
448458
}; // class MessageFormatter
449459

450460
} // namespace message2

icu4c/source/i18n/unicode/messageformat2_arguments.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ template class U_I18N_API LocalArray<message2::Formattable>;
4343

4444
namespace message2 {
4545

46-
class MessageContext;
46+
class MessageFormatter;
4747

4848
// Arguments
4949
// ----------
@@ -112,7 +112,9 @@ namespace message2 {
112112
private:
113113
friend class MessageContext;
114114

115-
const Formattable* getArgument(const data_model::VariableName&, UErrorCode&) const;
115+
const Formattable* getArgument(const MessageFormatter&,
116+
const data_model::VariableName&,
117+
UErrorCode&) const;
116118

117119
// Avoids using Hashtable so that code constructing a Hashtable
118120
// doesn't have to appear in this header file

icu4c/source/test/intltest/messageformat2test_read_json.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,9 @@ void TestMessageFormat2::jsonTestsFromFiles(IcuTestErrorCode& errorCode) {
309309
runTestsFromJsonFile(*this, "spec/functions/time.json", errorCode);
310310

311311
// Other tests (non-spec)
312+
// TODO: Delete this file after https://github.com/unicode-org/message-format-wg/pull/904
313+
// lands and the tests here are updated from the spec repo
314+
runTestsFromJsonFile(*this, "normalization.json", errorCode);
312315
runTestsFromJsonFile(*this, "more-functions.json", errorCode);
313316
runTestsFromJsonFile(*this, "valid-tests.json", errorCode);
314317
runTestsFromJsonFile(*this, "resolution-errors.json", errorCode);

testdata/message2/normalization.json

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
{
2+
"$schema": "https://raw.githubusercontent.com/unicode-org/message-format-wg/main/test/schemas/v0/tests.schema.json",
3+
"scenario": "Syntax",
4+
"description": "Test cases that do not depend on any registry definitions.",
5+
"defaultTestProperties": {
6+
"locale": "en-US"
7+
},
8+
"tests": [
9+
{
10+
"description": "NFC: literals are not normalized",
11+
"src": "\u1E0A\u0323",
12+
"exp": "\u1E0A\u0323"
13+
},
14+
{
15+
"description": "NFC: variables are compared to each other as-if normalized; decl is non-normalized, use is",
16+
"src": ".local $\u0044\u0323\u0307 = {foo} {{{$\u1E0c\u0307}}}",
17+
"exp": "foo"
18+
},
19+
{
20+
"description": "NFC: variables are compared to each other as-if normalized; decl is normalized, use isn't",
21+
"src": ".local $\u1E0c\u0307 = {foo} {{{$\u0044\u0323\u0307}}}",
22+
"exp": "foo"
23+
},
24+
{
25+
"description": "NFC: variables are compared to each other as-if normalized; decl is normalized, use isn't",
26+
"src": ".input {$\u1E0c\u0307} {{{$\u0044\u0323\u0307}}}",
27+
"params": [{"name": "\u1E0c\u0307", "value": "foo"}],
28+
"exp": "foo"
29+
},
30+
{
31+
"description": "NFC: variables are compared to each other as-if normalized; decl is non-normalized, use is",
32+
"src": ".input {$\u0044\u0323\u0307} {{{$\u1E0c\u0307}}}",
33+
"params": [{"name": "\u0044\u0323\u0307", "value": "foo"}],
34+
"exp": "foo"
35+
},
36+
{
37+
"description": "NFC: keys are normalized",
38+
"src": ".local $x = {\u1E0A\u0323 :string} .match {$x} \u1E0A\u0323 {{Not normalized}} \u1E0C\u0307 {{Normalized}} * {{Wrong}}",
39+
"expErrors": [{"type": "duplicate-variant"}]
40+
},
41+
{
42+
"description": "NFC: keys are normalized",
43+
"src": ".local $x = {\u1E0A\u0323 :string} .match {$x} \u1E0C\u0307 {{Right}} * {{Wrong}}",
44+
"exp": "Right"
45+
}
46+
]
47+
}

0 commit comments

Comments
 (0)