@@ -1446,19 +1446,30 @@ void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
14461446 IsAtPhysicalStartOfLine = StartOfLine;
14471447}
14481448
// Returns true when Codepoint is a member of the character set built from
// UnicodeWhitespaceCharRanges, false otherwise.
// The set is a function-local static const, so it is constructed once and
// reused by every later call.
1449+ static bool isUnicodeWhitespace (uint32_t Codepoint) {
1450+ static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars (
1451+ UnicodeWhitespaceCharRanges);
1452+ return UnicodeWhitespaceChars.contains (Codepoint);
1453+ }
1454+
14491455static bool isAllowedIDChar (uint32_t C, const LangOptions &LangOpts) {
14501456 if (LangOpts.AsmPreprocessor ) {
14511457 return false ;
14521458 } else if (LangOpts.DollarIdents && ' $' == C) {
14531459 return true ;
1454- } else if (LangOpts.CPlusPlus11 || LangOpts.C11 ) {
1460+ } else if (LangOpts.CPlusPlus ) {
1461+ // A non-leading codepoint must have the XID_Continue property.
1462+ // XIDContinueRanges doesn't contain characters also in XIDStartRanges,
1463+ // so we need to check both tables.
1464+ // '_' doesn't have the XID_Continue property but is allowed in C++.
1465+ static const llvm::sys::UnicodeCharSet XIDStartChars (XIDStartRanges);
1466+ static const llvm::sys::UnicodeCharSet XIDContinueChars (XIDContinueRanges);
1467+ return C == ' _' || XIDStartChars.contains (C) ||
1468+ XIDContinueChars.contains (C);
1469+ } else if (LangOpts.C11 ) {
14551470 static const llvm::sys::UnicodeCharSet C11AllowedIDChars (
14561471 C11AllowedIDCharRanges);
14571472 return C11AllowedIDChars.contains (C);
1458- } else if (LangOpts.CPlusPlus ) {
1459- static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars (
1460- CXX03AllowedIDCharRanges);
1461- return CXX03AllowedIDChars.contains (C);
14621473 } else {
14631474 static const llvm::sys::UnicodeCharSet C99AllowedIDChars (
14641475 C99AllowedIDCharRanges);
@@ -1467,20 +1478,24 @@ static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
14671478}
14681479
// Decides whether code point C may begin an identifier under LangOpts.
// (Diff view: lines marked '-' are the implementation being removed, lines
// marked '+' are its replacement; the notes below describe the '+' version.)
//   - AsmPreprocessor: always false.
//   - CPlusPlus: true for '_' or any member of XIDStartRanges.
//   - otherwise: C must pass isAllowedIDChar and must not appear in the
//     C11 (when LangOpts.C11) or C99 disallowed-initial ranges.
14691480static bool isAllowedInitiallyIDChar (uint32_t C, const LangOptions &LangOpts) {
// The removed assert required callers to have checked isAllowedIDChar first;
// the replacement performs that check itself further down for the C modes.
1470- assert (isAllowedIDChar (C, LangOpts));
14711481 if (LangOpts.AsmPreprocessor ) {
14721482 return false ;
1473- } else if (LangOpts.CPlusPlus11 || LangOpts.C11 ) {
1483+ }
1484+ if (LangOpts.CPlusPlus ) {
1485+ static const llvm::sys::UnicodeCharSet XIDStartChars (XIDStartRanges);
1486+ // '_' doesn't have the XID_Start property but is allowed in C++.
1487+ return C == ' _' || XIDStartChars.contains (C);
1488+ }
// Non-C++ modes: a character that is not an identifier character at all can
// never start one.
1489+ if (!isAllowedIDChar (C, LangOpts))
1490+ return false ;
1491+ if (LangOpts.C11 ) {
14741492 static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars (
14751493 C11DisallowedInitialIDCharRanges);
14761494 return !C11DisallowedInitialIDChars.contains (C);
1477- } else if (LangOpts.CPlusPlus ) {
1478- return true ;
1479- } else {
1480- static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars (
1481- C99DisallowedInitialIDCharRanges);
1482- return !C99DisallowedInitialIDChars.contains (C);
14831495 }
// Fallback when neither CPlusPlus nor C11 is set: apply the C99
// disallowed-initial table.
1496+ static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars (
1497+ C99DisallowedInitialIDCharRanges);
1498+ return !C99DisallowedInitialIDChars.contains (C);
14841499}
14851500
14861501static inline CharSourceRange makeCharRange (Lexer &L, const char *Begin,
@@ -1512,16 +1527,6 @@ static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
15121527 << CannotStartIdentifier;
15131528 }
15141529 }
1515-
1516- // Check C++98 compatibility.
1517- if (!Diags.isIgnored (diag::warn_cxx98_compat_unicode_id, Range.getBegin ())) {
1518- static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars (
1519- CXX03AllowedIDCharRanges);
1520- if (!CXX03AllowedIDChars.contains (C)) {
1521- Diags.Report (Range.getBegin (), diag::warn_cxx98_compat_unicode_id)
1522- << Range;
1523- }
1524- }
15251530}
15261531
15271532// / After encountering UTF-8 character C and interpreting it as an identifier
@@ -1608,14 +1613,55 @@ static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
16081613 }
16091614}
16101615
// Reports an error for a non-ASCII code point that is not valid at its
// position in an identifier, attaching a fix-it that removes Range.
//
// Diags     - engine the error is reported through.
// LangOpts  - language options forwarded to the identifier-character checks.
// CodePoint - the code point being examined.
// Range     - source range of the code point; used as the diagnostic location
//             (its begin), the highlighted range, and the removal fix-it.
// IsFirst   - true when the code point is in the leading identifier position.
//
// Does nothing for ASCII code points or for code points that are valid at the
// given position.
1616+ static void diagnoseInvalidUnicodeCodepointInIdentifier (
1617+ DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
1618+ CharSourceRange Range, bool IsFirst) {
1619+ if (isASCII (CodePoint))
1620+ return ;
1621+
// A code point accepted as an identifier start is treated as a valid
// continuation too, so isAllowedIDChar is consulted only when it is not.
1622+ bool IsIDStart = isAllowedInitiallyIDChar (CodePoint, LangOpts);
1623+ bool IsIDContinue = IsIDStart || isAllowedIDChar (CodePoint, LangOpts);
1624+
// Valid for its position: nothing to diagnose.
1625+ if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
1626+ return ;
1627+
// True when the character would be fine later in an identifier but cannot
// begin one.
1628+ bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;
1629+
// Render the code point as upper-case hex for the diagnostic text.
// NOTE(review): the trailing 4 is presumably a minimum digit width; confirm
// against llvm::write_hex.
1630+ llvm::SmallString<5 > CharBuf;
1631+ llvm::raw_svector_ostream CharOS (CharBuf);
1632+ llvm::write_hex (CharOS, CodePoint, llvm::HexPrintStyle::Upper, 4 );
1633+
// Non-leading position, or leading position with a continue-only character:
// use the identifier-specific error. InvalidOnlyAtStart is passed as an int
// argument (presumably selecting the message variant; verify against the
// diagnostic definition).
1634+ if (!IsFirst || InvalidOnlyAtStart) {
1635+ Diags.Report (Range.getBegin (), diag::err_character_not_allowed_identifier)
1636+ << Range << CharBuf << int (InvalidOnlyAtStart)
1637+ << FixItHint::CreateRemoval (Range);
1638+ } else {
// Leading position and not an identifier character of any kind.
1639+ Diags.Report (Range.getBegin (), diag::err_character_not_allowed)
1640+ << Range << CharBuf << FixItHint::CreateRemoval (Range);
1641+ }
1642+ }
1643+
16111644bool Lexer::tryConsumeIdentifierUCN (const char *&CurPtr, unsigned Size,
16121645 Token &Result) {
16131646 const char *UCNPtr = CurPtr + Size;
16141647 uint32_t CodePoint = tryReadUCN (UCNPtr, CurPtr, /* Token=*/ nullptr );
1615- if (CodePoint == 0 || ! isAllowedIDChar (CodePoint, LangOpts))
1648+ if (CodePoint == 0 ) {
16161649 return false ;
1650+ }
16171651
1618- if (!isLexingRawMode ())
1652+ if (!isAllowedIDChar (CodePoint, LangOpts)) {
1653+ if (isASCII (CodePoint) || isUnicodeWhitespace (CodePoint))
1654+ return false ;
1655+ if (!isLexingRawMode ())
1656+ diagnoseInvalidUnicodeCodepointInIdentifier (
1657+ PP->getDiagnostics (), LangOpts, CodePoint,
1658+ makeCharRange (*this , CurPtr, UCNPtr),
1659+ /* IsFirst=*/ false );
1660+
1661+ // We got a Unicode codepoint that is neither a space nor a
1662+ // valid identifier part.
1663+ // Carry on as if the codepoint was valid for recovery purposes.
1664+ } else if (!isLexingRawMode ())
16191665 maybeDiagnoseIDCharCompat (PP->getDiagnostics (), CodePoint,
16201666 makeCharRange (*this , CurPtr, UCNPtr),
16211667 /* IsFirst=*/ false );
@@ -1638,11 +1684,21 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
16381684 (const llvm::UTF8 *)BufferEnd,
16391685 &CodePoint,
16401686 llvm::strictConversion);
1641- if (Result != llvm::conversionOK ||
1642- !isAllowedIDChar (static_cast <uint32_t >(CodePoint), LangOpts))
1687+ if (Result != llvm::conversionOK)
16431688 return false ;
16441689
1645- if (!isLexingRawMode ()) {
1690+ if (!isAllowedIDChar (static_cast <uint32_t >(CodePoint), LangOpts)) {
1691+ if (isASCII (CodePoint) || isUnicodeWhitespace (CodePoint))
1692+ return false ;
1693+
1694+ if (!isLexingRawMode ())
1695+ diagnoseInvalidUnicodeCodepointInIdentifier (
1696+ PP->getDiagnostics (), LangOpts, CodePoint,
1697+ makeCharRange (*this , CurPtr, UnicodePtr), /* IsFirst=*/ false );
1698+ // We got a Unicode codepoint that is neither a space nor a
1699+ // valid identifier part. Carry on as if the codepoint was
1700+ // valid for recovery purposes.
1701+ } else if (!isLexingRawMode ()) {
16461702 maybeDiagnoseIDCharCompat (PP->getDiagnostics (), CodePoint,
16471703 makeCharRange (*this , CurPtr, UnicodePtr),
16481704 /* IsFirst=*/ false );
@@ -3136,10 +3192,8 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
31363192
31373193bool Lexer::CheckUnicodeWhitespace (Token &Result, uint32_t C,
31383194 const char *CurPtr) {
3139- static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars (
3140- UnicodeWhitespaceCharRanges);
31413195 if (!isLexingRawMode () && !PP->isPreprocessedOutput () &&
3142- UnicodeWhitespaceChars. contains (C)) {
3196+ isUnicodeWhitespace (C)) {
31433197 Diag (BufferPtr, diag::ext_unicode_whitespace)
31443198 << makeCharRange (*this , BufferPtr, CurPtr);
31453199
@@ -3150,7 +3204,7 @@ bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
31503204}
31513205
31523206bool Lexer::LexUnicode (Token &Result, uint32_t C, const char *CurPtr) {
3153- if (isAllowedIDChar (C, LangOpts) && isAllowedInitiallyIDChar (C, LangOpts)) {
3207+ if (isAllowedInitiallyIDChar (C, LangOpts)) {
31543208 if (!isLexingRawMode () && !ParsingPreprocessorDirective &&
31553209 !PP->isPreprocessedOutput ()) {
31563210 maybeDiagnoseIDCharCompat (PP->getDiagnostics (), C,
@@ -3165,8 +3219,8 @@ bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
31653219 }
31663220
31673221 if (!isLexingRawMode () && !ParsingPreprocessorDirective &&
3168- !PP->isPreprocessedOutput () &&
3169- !isASCII (*BufferPtr ) && !isAllowedIDChar (C, LangOpts )) {
3222+ !PP->isPreprocessedOutput () && ! isASCII (*BufferPtr) &&
3223+ !isAllowedInitiallyIDChar (C, LangOpts ) && !isUnicodeWhitespace (C )) {
31703224 // Non-ASCII characters tend to creep into source code unintentionally.
31713225 // Instead of letting the parser complain about the unknown token,
31723226 // just drop the character.
@@ -3176,9 +3230,9 @@ bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
31763230 // loophole in the mapping of Unicode characters to basic character set
31773231 // characters that allows us to map these particular characters to, say,
31783232 // whitespace.
3179- Diag (BufferPtr, diag::err_non_ascii)
3180- << FixItHint::CreateRemoval ( makeCharRange (* this , BufferPtr, CurPtr));
3181-
3233+ diagnoseInvalidUnicodeCodepointInIdentifier (
3234+ PP-> getDiagnostics (), LangOpts, C,
3235+ makeCharRange (* this , BufferPtr, CurPtr), /* IsFirst=*/ true );
31823236 BufferPtr = CurPtr;
31833237 return false ;
31843238 }
0 commit comments