@@ -1025,7 +1025,7 @@ static const char * const hangul_syllables[][3] = {
10251025
10261026/* These ranges need to match makeunicodedata.py:cjk_ranges. */
10271027static int
1028- is_unified_ideograph (Py_UCS4 code )
1028+ is_cjk_unified_ideograph (Py_UCS4 code )
10291029{
10301030 return
10311031 (0x3400 <= code && code <= 0x4DBF ) || /* CJK Ideograph Extension A */
@@ -1039,6 +1039,15 @@ is_unified_ideograph(Py_UCS4 code)
10391039 (0x31350 <= code && code <= 0x323AF ); /* CJK Ideograph Extension H */
10401040}
10411041
1042+ /* These ranges need to match makeunicodedata.py:tangut_ranges. */
1043+ static int
1044+ is_tangut_ideograph (Py_UCS4 code )
1045+ {
1046+ return
1047+ (0x17000 <= code && code <= 0x187F7 ) || /* Tangut */
1048+ (0x18D00 <= code && code <= 0x18D08 ); /* Tangut Supplement */
1049+ }
1050+
10421051/* macros used to determine if the given code point is in the PUA range that
10431052 * we are using to store aliases and named sequences */
10441053#define IS_ALIAS (cp ) ((cp >= aliases_start) && (cp < aliases_end))
@@ -1098,14 +1107,22 @@ _getucname(PyObject *self,
10981107 return 1 ;
10991108 }
11001109
1101- if (is_unified_ideograph (code )) {
1110+ if (is_cjk_unified_ideograph (code )) {
11021111 if (buflen < 28 )
11031112 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
11041113 return 0 ;
11051114 sprintf (buffer , "CJK UNIFIED IDEOGRAPH-%X" , code );
11061115 return 1 ;
11071116 }
11081117
1118+ if (is_tangut_ideograph (code )) {
1119+ if (buflen < 23 )
1120+ /* Worst case: TANGUT IDEOGRAPH-18D08 */
1121+ return 0 ;
1122+ sprintf (buffer , "TANGUT IDEOGRAPH-%X" , code );
1123+ return 1 ;
1124+ }
1125+
11091126 /* get offset into phrasebook */
11101127 offset = phrasebook_offset1 [(code >>phrasebook_shift )];
11111128 offset = phrasebook_offset2 [(offset <<phrasebook_shift ) +
@@ -1236,7 +1253,7 @@ _getcode(PyObject* self,
12361253 return 0 ;
12371254 }
12381255
1239- /* Check for unified ideographs. */
1256+ /* Check for CJK unified ideographs. */
12401257 if (strncmp (name , "CJK UNIFIED IDEOGRAPH-" , 22 ) == 0 ) {
12411258 /* Four or five hexdigits must follow. */
12421259 v = 0 ;
@@ -1254,12 +1271,38 @@ _getcode(PyObject* self,
12541271 return 0 ;
12551272 name ++ ;
12561273 }
1257- if (!is_unified_ideograph (v ))
1274+ if (!is_cjk_unified_ideograph (v ))
1275+ return 0 ;
1276+ * code = v ;
1277+ return 1 ;
1278+ }
1279+
1280+
1281+ /* Check for Tangut ideographs. */
1282+ if (strncmp (name , "TANGUT IDEOGRAPH-" , 17 ) == 0 ) {
1283+ /* Five hexdigits must follow. */
1284+ v = 0 ;
1285+ name += 17 ;
1286+ namelen -= 17 ;
1287+ if (namelen != 5 )
1288+ return 0 ;
1289+ while (namelen -- ) {
1290+ v *= 16 ;
1291+ if (* name >= '0' && * name <= '9' )
1292+ v += * name - '0' ;
1293+ else if (* name >= 'A' && * name <= 'F' )
1294+ v += * name - 'A' + 10 ;
1295+ else
1296+ return 0 ;
1297+ name ++ ;
1298+ }
1299+ if (!is_tangut_ideograph (v ))
12581300 return 0 ;
12591301 * code = v ;
12601302 return 1 ;
12611303 }
12621304
1305+
12631306 /* the following is the same as python's dictionary lookup, with
12641307 only minor changes. see the makeunicodedata script for more
12651308 details */
0 commit comments