Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit daa251c

Browse files
committed
Patch #1302 by Christian Heimes (with some love from me :-).
The patch fixes the output for profile and cProfile. Another patch from Alexandre and me added additional calls to the UTF-8 codec.
1 parent 79b79ee commit daa251c

2 files changed

Lines changed: 48 additions & 12 deletions

File tree

Lib/test/regrtest.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1119,6 +1119,15 @@ def __init__(self):
11191119
if not os.path.supports_unicode_filenames:
11201120
self.expected.add('test_pep277')
11211121

1122+
# doctest, profile and cProfile tests fail when the codec for the fs
1123+
# encoding isn't built in because PyUnicode_Decode() adds two calls
1124+
# into Python.
1125+
encs = ("utf-8", "latin-1", "ascii", "mbcs", "utf-16", "utf-32")
1126+
if sys.getfilesystemencoding().lower() not in encs:
1127+
self.expected.add('test_profile')
1128+
self.expected.add('test_cProfile')
1129+
self.expected.add('test_doctest')
1130+
11221131
try:
11231132
from test import test_socket_ssl
11241133
except ImportError:

Objects/unicodeobject.c

Lines changed: 39 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
4141

4242
#define PY_SSIZE_T_CLEAN
4343
#include "Python.h"
44+
#include "bytes_methods.h"
4445

4546
#include "unicodeobject.h"
4647
#include "ucnhash.h"
@@ -592,9 +593,9 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
592593
if (*f == '%') {
593594
const char* p = f;
594595
width = 0;
595-
while (isdigit(Py_CHARMASK(*f)))
596+
while (ISDIGIT(*f))
596597
width = (width*10) + *f++ - '0';
597-
while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
598+
while (*++f && *f != '%' && !ISALPHA(*f))
598599
;
599600

600601
/* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
@@ -755,12 +756,12 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
755756
zeropad = (*f == '0');
756757
/* parse the width.precision part */
757758
width = 0;
758-
while (isdigit(Py_CHARMASK(*f)))
759+
while (ISDIGIT(*f))
759760
width = (width*10) + *f++ - '0';
760761
precision = 0;
761762
if (*f == '.') {
762763
f++;
763-
while (isdigit(Py_CHARMASK(*f)))
764+
while (ISDIGIT(*f))
764765
precision = (precision*10) + *f++ - '0';
765766
}
766767
/* handle the long flag, but only for %ld and %lu.
@@ -1056,21 +1057,47 @@ PyObject *PyUnicode_Decode(const char *s,
10561057
{
10571058
PyObject *buffer = NULL, *unicode;
10581059
Py_buffer info;
1060+
char lower[20]; /* Enough for any encoding name we recognize */
1061+
char *l;
1062+
const char *e;
10591063

10601064
if (encoding == NULL)
1061-
encoding = PyUnicode_GetDefaultEncoding();
1065+
encoding = PyUnicode_GetDefaultEncoding();
1066+
1067+
/* Convert encoding to lower case and replace '_' with '-' in order to
1068+
catch e.g. UTF_8 */
1069+
e = encoding;
1070+
l = lower;
1071+
while (*e && l < &lower[(sizeof lower) - 2]) {
1072+
if (ISUPPER(*e)) {
1073+
*l++ = TOLOWER(*e++);
1074+
}
1075+
else if (*e == '_') {
1076+
*l++ = '-';
1077+
e++;
1078+
}
1079+
else {
1080+
*l++ = *e++;
1081+
}
1082+
}
1083+
*l = '\0';
10621084

10631085
/* Shortcuts for common default encodings */
1064-
if (strcmp(encoding, "utf-8") == 0)
1086+
if (strcmp(lower, "utf-8") == 0)
10651087
return PyUnicode_DecodeUTF8(s, size, errors);
1066-
else if (strcmp(encoding, "latin-1") == 0)
1088+
else if ((strcmp(lower, "latin-1") == 0) ||
1089+
(strcmp(lower, "iso-8859-1") == 0))
10671090
return PyUnicode_DecodeLatin1(s, size, errors);
10681091
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1069-
else if (strcmp(encoding, "mbcs") == 0)
1092+
else if (strcmp(lower, "mbcs") == 0)
10701093
return PyUnicode_DecodeMBCS(s, size, errors);
10711094
#endif
1072-
else if (strcmp(encoding, "ascii") == 0)
1095+
else if (strcmp(lower, "ascii") == 0)
10731096
return PyUnicode_DecodeASCII(s, size, errors);
1097+
else if (strcmp(lower, "utf-16") == 0)
1098+
return PyUnicode_DecodeUTF16(s, size, errors, 0);
1099+
else if (strcmp(lower, "utf-32") == 0)
1100+
return PyUnicode_DecodeUTF32(s, size, errors, 0);
10741101

10751102
/* Decode via the codec registry */
10761103
buffer = NULL;
@@ -1470,7 +1497,7 @@ char utf7_special[128] = {
14701497
#define B64(n) \
14711498
("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
14721499
#define B64CHAR(c) \
1473-
(isalnum(c) || (c) == '+' || (c) == '/')
1500+
(ISALNUM(c) || (c) == '+' || (c) == '/')
14741501
#define UB64(c) \
14751502
((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
14761503
(c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
@@ -2703,7 +2730,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
27032730
}
27042731
for (i = 0; i < digits; ++i) {
27052732
c = (unsigned char) s[i];
2706-
if (!isxdigit(c)) {
2733+
if (!ISXDIGIT(c)) {
27072734
endinpos = (s+i+1)-starts;
27082735
if (unicode_decode_call_errorhandler(
27092736
errors, &errorHandler,
@@ -3077,7 +3104,7 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
30773104
outpos = p-PyUnicode_AS_UNICODE(v);
30783105
for (x = 0, i = 0; i < count; ++i, ++s) {
30793106
c = (unsigned char)*s;
3080-
if (!isxdigit(c)) {
3107+
if (!ISXDIGIT(c)) {
30813108
endinpos = s-starts;
30823109
if (unicode_decode_call_errorhandler(
30833110
errors, &errorHandler,

0 commit comments

Comments
 (0)