Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 4e80636

Browse files
cor3ntin authored and AaronBallman committed
Implement P1949
This adds the Unicode 13 data for XID_Start and XID_Continue. The definition of a valid identifier is changed in all C++ modes, as P1949 (https://wg21.link/p1949) was accepted by WG21 as a defect report.
1 parent f22e586 commit 4e80636

16 files changed

Lines changed: 516 additions & 241 deletions

clang/include/clang/Basic/CharInfo.h

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,11 @@ LLVM_READNONE inline bool isASCII(char c) {
4343
return static_cast<unsigned char>(c) <= 127;
4444
}
4545

46+
LLVM_READNONE inline bool isASCII(unsigned char c) { return c <= 127; }
47+
48+
/// Returns true if this is an ASCII character.
49+
LLVM_READNONE inline bool isASCII(uint32_t c) { return c <= 127; }
50+
4651
/// Returns true if this is a valid first character of a C identifier,
4752
/// which is [a-zA-Z_].
4853
LLVM_READONLY inline bool isIdentifierHead(unsigned char c,

clang/include/clang/Basic/DiagnosticLexKinds.td

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -113,8 +113,10 @@ def warn_four_char_character_literal : Warning<
113113
// Unicode and UCNs
114114
def err_invalid_utf8 : Error<
115115
"source file is not valid UTF-8">;
116-
def err_non_ascii : Error<
117-
"non-ASCII characters are not allowed outside of literals and identifiers">;
116+
def err_character_not_allowed : Error<
117+
"unexpected character <U+%0>">;
118+
def err_character_not_allowed_identifier : Error<
119+
"character <U+%0> not allowed %select{in|at the start of}1 an identifier">;
118120
def ext_unicode_whitespace : ExtWarn<
119121
"treating Unicode character as whitespace">,
120122
InGroup<DiagGroup<"unicode-whitespace">>;
@@ -150,9 +152,6 @@ def warn_c99_compat_unicode_id : Warning<
150152
"%select{using this character in an identifier|starting an identifier with "
151153
"this character}0 is incompatible with C99">,
152154
InGroup<C99Compat>, DefaultIgnore;
153-
def warn_cxx98_compat_unicode_id : Warning<
154-
"using this character in an identifier is incompatible with C++98">,
155-
InGroup<CXX98Compat>, DefaultIgnore;
156155

157156
def warn_cxx98_compat_literal_ucn_escape_basic_scs : Warning<
158157
"specifying character '%0' with a universal character name "

clang/lib/Lex/Lexer.cpp

Lines changed: 91 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -1446,19 +1446,30 @@ void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
14461446
IsAtPhysicalStartOfLine = StartOfLine;
14471447
}
14481448

1449+
static bool isUnicodeWhitespace(uint32_t Codepoint) {
1450+
static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
1451+
UnicodeWhitespaceCharRanges);
1452+
return UnicodeWhitespaceChars.contains(Codepoint);
1453+
}
1454+
14491455
static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
14501456
if (LangOpts.AsmPreprocessor) {
14511457
return false;
14521458
} else if (LangOpts.DollarIdents && '$' == C) {
14531459
return true;
1454-
} else if (LangOpts.CPlusPlus11 || LangOpts.C11) {
1460+
} else if (LangOpts.CPlusPlus) {
1461+
// A non-leading codepoint must have the XID_Continue property.
1462+
// XIDContinueRanges doesn't contain characters also in XIDStartRanges,
1463+
// so we need to check both tables.
1464+
// '_' doesn't have the XID_Continue property but is allowed in C++.
1465+
static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1466+
static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
1467+
return C == '_' || XIDStartChars.contains(C) ||
1468+
XIDContinueChars.contains(C);
1469+
} else if (LangOpts.C11) {
14551470
static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
14561471
C11AllowedIDCharRanges);
14571472
return C11AllowedIDChars.contains(C);
1458-
} else if (LangOpts.CPlusPlus) {
1459-
static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
1460-
CXX03AllowedIDCharRanges);
1461-
return CXX03AllowedIDChars.contains(C);
14621473
} else {
14631474
static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
14641475
C99AllowedIDCharRanges);
@@ -1467,20 +1478,24 @@ static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
14671478
}
14681479

14691480
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
1470-
assert(isAllowedIDChar(C, LangOpts));
14711481
if (LangOpts.AsmPreprocessor) {
14721482
return false;
1473-
} else if (LangOpts.CPlusPlus11 || LangOpts.C11) {
1483+
}
1484+
if (LangOpts.CPlusPlus) {
1485+
static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1486+
// '_' doesn't have the XID_Start property but is allowed in C++.
1487+
return C == '_' || XIDStartChars.contains(C);
1488+
}
1489+
if (!isAllowedIDChar(C, LangOpts))
1490+
return false;
1491+
if (LangOpts.C11) {
14741492
static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
14751493
C11DisallowedInitialIDCharRanges);
14761494
return !C11DisallowedInitialIDChars.contains(C);
1477-
} else if (LangOpts.CPlusPlus) {
1478-
return true;
1479-
} else {
1480-
static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1481-
C99DisallowedInitialIDCharRanges);
1482-
return !C99DisallowedInitialIDChars.contains(C);
14831495
}
1496+
static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1497+
C99DisallowedInitialIDCharRanges);
1498+
return !C99DisallowedInitialIDChars.contains(C);
14841499
}
14851500

14861501
static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
@@ -1512,16 +1527,6 @@ static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
15121527
<< CannotStartIdentifier;
15131528
}
15141529
}
1515-
1516-
// Check C++98 compatibility.
1517-
if (!Diags.isIgnored(diag::warn_cxx98_compat_unicode_id, Range.getBegin())) {
1518-
static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
1519-
CXX03AllowedIDCharRanges);
1520-
if (!CXX03AllowedIDChars.contains(C)) {
1521-
Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id)
1522-
<< Range;
1523-
}
1524-
}
15251530
}
15261531

15271532
/// After encountering UTF-8 character C and interpreting it as an identifier
@@ -1608,14 +1613,55 @@ static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
16081613
}
16091614
}
16101615

1616+
static void diagnoseInvalidUnicodeCodepointInIdentifier(
1617+
DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
1618+
CharSourceRange Range, bool IsFirst) {
1619+
if (isASCII(CodePoint))
1620+
return;
1621+
1622+
bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts);
1623+
bool IsIDContinue = IsIDStart || isAllowedIDChar(CodePoint, LangOpts);
1624+
1625+
if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
1626+
return;
1627+
1628+
bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;
1629+
1630+
llvm::SmallString<5> CharBuf;
1631+
llvm::raw_svector_ostream CharOS(CharBuf);
1632+
llvm::write_hex(CharOS, CodePoint, llvm::HexPrintStyle::Upper, 4);
1633+
1634+
if (!IsFirst || InvalidOnlyAtStart) {
1635+
Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
1636+
<< Range << CharBuf << int(InvalidOnlyAtStart)
1637+
<< FixItHint::CreateRemoval(Range);
1638+
} else {
1639+
Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
1640+
<< Range << CharBuf << FixItHint::CreateRemoval(Range);
1641+
}
1642+
}
1643+
16111644
bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
16121645
Token &Result) {
16131646
const char *UCNPtr = CurPtr + Size;
16141647
uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
1615-
if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
1648+
if (CodePoint == 0) {
16161649
return false;
1650+
}
16171651

1618-
if (!isLexingRawMode())
1652+
if (!isAllowedIDChar(CodePoint, LangOpts)) {
1653+
if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
1654+
return false;
1655+
if (!isLexingRawMode())
1656+
diagnoseInvalidUnicodeCodepointInIdentifier(
1657+
PP->getDiagnostics(), LangOpts, CodePoint,
1658+
makeCharRange(*this, CurPtr, UCNPtr),
1659+
/*IsFirst=*/false);
1660+
1661+
// We got a unicode codepoint that is neither a space nor a
1662+
// valid identifier part.
1663+
// Carry on as if the codepoint was valid for recovery purposes.
1664+
} else if (!isLexingRawMode())
16191665
maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
16201666
makeCharRange(*this, CurPtr, UCNPtr),
16211667
/*IsFirst=*/false);
@@ -1638,11 +1684,21 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
16381684
(const llvm::UTF8 *)BufferEnd,
16391685
&CodePoint,
16401686
llvm::strictConversion);
1641-
if (Result != llvm::conversionOK ||
1642-
!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
1687+
if (Result != llvm::conversionOK)
16431688
return false;
16441689

1645-
if (!isLexingRawMode()) {
1690+
if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts)) {
1691+
if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
1692+
return false;
1693+
1694+
if (!isLexingRawMode())
1695+
diagnoseInvalidUnicodeCodepointInIdentifier(
1696+
PP->getDiagnostics(), LangOpts, CodePoint,
1697+
makeCharRange(*this, CurPtr, UnicodePtr), /*IsFirst=*/false);
1698+
// We got a unicode codepoint that is neither a space nor a
1699+
// valid identifier part. Carry on as if the codepoint was
1700+
// valid for recovery purposes.
1701+
} else if (!isLexingRawMode()) {
16461702
maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
16471703
makeCharRange(*this, CurPtr, UnicodePtr),
16481704
/*IsFirst=*/false);
@@ -3136,10 +3192,8 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
31363192

31373193
bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
31383194
const char *CurPtr) {
3139-
static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
3140-
UnicodeWhitespaceCharRanges);
31413195
if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
3142-
UnicodeWhitespaceChars.contains(C)) {
3196+
isUnicodeWhitespace(C)) {
31433197
Diag(BufferPtr, diag::ext_unicode_whitespace)
31443198
<< makeCharRange(*this, BufferPtr, CurPtr);
31453199

@@ -3150,7 +3204,7 @@ bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
31503204
}
31513205

31523206
bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
3153-
if (isAllowedIDChar(C, LangOpts) && isAllowedInitiallyIDChar(C, LangOpts)) {
3207+
if (isAllowedInitiallyIDChar(C, LangOpts)) {
31543208
if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
31553209
!PP->isPreprocessedOutput()) {
31563210
maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
@@ -3165,8 +3219,8 @@ bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
31653219
}
31663220

31673221
if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
3168-
!PP->isPreprocessedOutput() &&
3169-
!isASCII(*BufferPtr) && !isAllowedIDChar(C, LangOpts)) {
3222+
!PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
3223+
!isAllowedInitiallyIDChar(C, LangOpts) && !isUnicodeWhitespace(C)) {
31703224
// Non-ASCII characters tend to creep into source code unintentionally.
31713225
// Instead of letting the parser complain about the unknown token,
31723226
// just drop the character.
@@ -3176,9 +3230,9 @@ bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
31763230
// loophole in the mapping of Unicode characters to basic character set
31773231
// characters that allows us to map these particular characters to, say,
31783232
// whitespace.
3179-
Diag(BufferPtr, diag::err_non_ascii)
3180-
<< FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr));
3181-
3233+
diagnoseInvalidUnicodeCodepointInIdentifier(
3234+
PP->getDiagnostics(), LangOpts, C,
3235+
makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true);
31823236
BufferPtr = CurPtr;
31833237
return false;
31843238
}

0 commit comments

Comments
 (0)