@@ -1297,6 +1297,118 @@ def test_comparewithasciistring(self):
12971297 # CRASHES comparewithasciistring([], b'abc')
12981298 # CRASHES comparewithasciistring(NULL, b'abc')
12991299
@support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_equaltoutf8(self):
    # Test PyUnicode_EqualToUTF8()
    #
    # The wrapper is expected to return 1 when the str equals the
    # UTF-8 encoded bytes and 0 otherwise.  The expectations below
    # treat the bytes argument as ending at its first null byte.
    from _testcapi import unicode_equaltoutf8 as equaltoutf8
    from _testcapi import unicode_asutf8andsize as asutf8andsize

    # Samples whose characters encode to 1, 2, 3 and 4 UTF-8 bytes,
    # plus the highest code point.
    strings = [
        'abc', '\xa1\xa2\xa3', '\u4f60\u597d\u4e16',
        '\U0001f600\U0001f601\U0001f602',
        '\U0010ffff',
    ]
    for s in strings:
        # subTest reports which sample failed and lets the remaining
        # samples still run.
        with self.subTest(s=s):
            # Call PyUnicode_AsUTF8AndSize() which creates the UTF-8
            # encoded string cached in the Unicode object.
            asutf8andsize(s, 0)
            b = s.encode()
            # Use the UTF-8 cache.
            self.assertEqual(equaltoutf8(s, b), 1)
            # New Unicode object without the UTF-8 cache.
            s2 = b.decode()
            self.assertEqual(equaltoutf8(s2, b), 1)
            self.assertEqual(equaltoutf8(s + 'x', b + b'x'), 1)
            self.assertEqual(equaltoutf8(s + 'x', b + b'y'), 0)
            # A trailing null byte in the bytes argument is ignored.
            self.assertEqual(equaltoutf8(s, b + b'\0'), 1)
            self.assertEqual(equaltoutf8(s2, b + b'\0'), 1)
            # A str containing a null character never compares equal.
            self.assertEqual(equaltoutf8(s + '\0', b + b'\0'), 0)
            self.assertEqual(equaltoutf8(s + '\0', b), 0)
            # Longer, truncated and corrupted byte strings.
            self.assertEqual(equaltoutf8(s2, b + b'x'), 0)
            self.assertEqual(equaltoutf8(s2, b[:-1]), 0)
            self.assertEqual(equaltoutf8(s2, b[:-1] + b'x'), 0)

    self.assertEqual(equaltoutf8('', b''), 1)
    self.assertEqual(equaltoutf8('', b'\0'), 1)

    # embedded null chars/bytes
    self.assertEqual(equaltoutf8('abc', b'abc\0def\0'), 1)
    self.assertEqual(equaltoutf8('a\0bc', b'abc'), 0)
    self.assertEqual(equaltoutf8('abc', b'a\0bc'), 0)

    # Surrogate characters are always treated as not equal
    self.assertEqual(equaltoutf8('\udcfe',
                     '\udcfe'.encode("utf8", "surrogateescape")), 0)
    self.assertEqual(equaltoutf8('\udcfe',
                     '\udcfe'.encode("utf8", "surrogatepass")), 0)
    self.assertEqual(equaltoutf8('\ud801',
                     '\ud801'.encode("utf8", "surrogatepass")), 0)
1345+
@support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_equaltoutf8andsize(self):
    # Test PyUnicode_EqualToUTF8AndSize()
    #
    # The wrapper is expected to return 1 when the str equals the
    # UTF-8 encoded bytes and 0 otherwise.  When the optional third
    # argument is given, only that many leading bytes are compared;
    # otherwise the whole bytes object is used, null bytes included.
    from _testcapi import unicode_equaltoutf8andsize as equaltoutf8andsize
    from _testcapi import unicode_asutf8andsize as asutf8andsize

    # Samples whose characters encode to 1, 2, 3 and 4 UTF-8 bytes,
    # plus the highest code point.
    strings = [
        'abc', '\xa1\xa2\xa3', '\u4f60\u597d\u4e16',
        '\U0001f600\U0001f601\U0001f602',
        '\U0010ffff',
    ]
    for s in strings:
        # subTest reports which sample failed and lets the remaining
        # samples still run.
        with self.subTest(s=s):
            # Call PyUnicode_AsUTF8AndSize() which creates the UTF-8
            # encoded string cached in the Unicode object.
            asutf8andsize(s, 0)
            b = s.encode()
            # Use the UTF-8 cache.
            self.assertEqual(equaltoutf8andsize(s, b), 1)
            # New Unicode object without the UTF-8 cache.
            s2 = b.decode()
            self.assertEqual(equaltoutf8andsize(s2, b), 1)
            self.assertEqual(equaltoutf8andsize(s + 'x', b + b'x'), 1)
            self.assertEqual(equaltoutf8andsize(s + 'x', b + b'y'), 0)
            # Null bytes are significant: they take part in the
            # comparison like any other byte.
            self.assertEqual(equaltoutf8andsize(s, b + b'\0'), 0)
            self.assertEqual(equaltoutf8andsize(s2, b + b'\0'), 0)
            self.assertEqual(equaltoutf8andsize(s + '\0', b + b'\0'), 1)
            self.assertEqual(equaltoutf8andsize(s + '\0', b), 0)
            # Longer, truncated and corrupted byte strings.
            self.assertEqual(equaltoutf8andsize(s2, b + b'x'), 0)
            self.assertEqual(equaltoutf8andsize(s2, b[:-1]), 0)
            self.assertEqual(equaltoutf8andsize(s2, b[:-1] + b'x'), 0)
            # Not null-terminated: the explicit size limits how many
            # bytes are compared.
            self.assertEqual(
                equaltoutf8andsize(s, b + b'x', len(b)), 1)
            self.assertEqual(
                equaltoutf8andsize(s2, b + b'x', len(b)), 1)
            self.assertEqual(
                equaltoutf8andsize(s + '\0', b + b'\0x', len(b) + 1), 1)
            self.assertEqual(
                equaltoutf8andsize(s2, b, len(b) - 1), 0)

    self.assertEqual(equaltoutf8andsize('', b''), 1)
    self.assertEqual(equaltoutf8andsize('', b'\0'), 0)
    self.assertEqual(equaltoutf8andsize('', b'x', 0), 1)

    # embedded null chars/bytes
    self.assertEqual(equaltoutf8andsize('abc\0def', b'abc\0def'), 1)
    self.assertEqual(equaltoutf8andsize('abc\0def\0', b'abc\0def\0'), 1)

    # Surrogate characters are always treated as not equal
    self.assertEqual(equaltoutf8andsize('\udcfe',
                     '\udcfe'.encode("utf8", "surrogateescape")), 0)
    self.assertEqual(equaltoutf8andsize('\udcfe',
                     '\udcfe'.encode("utf8", "surrogatepass")), 0)
    self.assertEqual(equaltoutf8andsize('\ud801',
                     '\ud801'.encode("utf8", "surrogatepass")), 0)

    def check_not_equal_encoding(text, encoding):
        # The str must not match its own bytes in a non-UTF-8 encoding;
        # the second assertion guards against an encoding whose output
        # happens to coincide with UTF-8 for this text.
        self.assertEqual(equaltoutf8andsize(text, text.encode(encoding)), 0)
        self.assertNotEqual(text.encode(encoding), text.encode("utf8"))

    # Strings encoded to other encodings are not equal to the expected
    # UTF-8 encoded string
    check_not_equal_encoding('Stéphane', 'latin1')
    check_not_equal_encoding('Stéphane', 'utf-16-le')  # embedded null characters
    check_not_equal_encoding('北京市', 'gbk')

    # CRASHES equaltoutf8andsize('abc', b'abc', -1)
    # CRASHES equaltoutf8andsize(b'abc', b'abc')
    # CRASHES equaltoutf8andsize([], b'abc')
    # CRASHES equaltoutf8andsize(NULL, b'abc')
    # CRASHES equaltoutf8andsize('abc', NULL)
1411+
13001412 @support .cpython_only
13011413 @unittest .skipIf (_testcapi is None , 'need _testcapi module' )
13021414 def test_richcompare (self ):
0 commit comments