Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit b5047fd

Browse files
committed
Add a workaround for a problem where UTF-8 strings can be corrupted
or broken by basic ctype functions in 4.4BSD descendants. This will be fixed in their future development branches, but they'll keep the POSIX incompatibility for backward compatibility in the near future.
1 parent 6db15d7 commit b5047fd

File tree

3 files changed

+71
-0
lines changed

3 files changed

+71
-0
lines changed

Include/pyport.h

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,39 @@ extern int fdatasync(int);
411411
extern double hypot(double, double);
412412
#endif
413413

414+
415+
/*******************************************************************
416+
On 4.4BSD-descendants, ctype functions serve the whole range of
417+
wchar_t character set rather than single byte code points only.
418+
This characteristic can break some operations of string objects
419+
including str.upper() and str.split() on UTF-8 locales. This
420+
workaround was provided by Tim Robbins of the FreeBSD project. He said
421+
the incompatibility will be fixed in FreeBSD 6.
422+
********************************************************************/
423+
424+
#ifdef __FreeBSD__
425+
#include <osreldate.h>
426+
#if __FreeBSD_version > 500039
427+
#include <ctype.h>
428+
#include <wctype.h>
429+
#undef isalnum
430+
#define isalnum(c) iswalnum(btowc(c))
431+
#undef isalpha
432+
#define isalpha(c) iswalpha(btowc(c))
433+
#undef islower
434+
#define islower(c) iswlower(btowc(c))
435+
#undef isspace
436+
#define isspace(c) iswspace(btowc(c))
437+
#undef isupper
438+
#define isupper(c) iswupper(btowc(c))
439+
#undef tolower
440+
#define tolower(c) towlower(btowc(c))
441+
#undef toupper
442+
#define toupper(c) towupper(btowc(c))
443+
#endif
444+
#endif
445+
446+
414447
/* Declarations for symbol visibility.
415448
416449
PyAPI_FUNC(type): Declares a public Python API function and return type

Lib/test/test_locale.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,38 @@ def testformat(formatstr, value, grouping = 0, output=None):
4747
locale.getpreferredencoding()
4848
finally:
4949
locale.setlocale(locale.LC_NUMERIC, oldlocale)
50+
51+
52+
# Test for the BSD rune locale bug affecting the ctype functions.
53+
def teststrop(s, method, output):
54+
if verbose:
55+
print "%s.%s() =? %s ..." % (repr(s), method, repr(output)),
56+
result = getattr(s, method)()
57+
if result != output:
58+
if verbose:
59+
print "no"
60+
print "%s.%s() == %s != %s" % (repr(s), method, repr(result),
61+
repr(output))
62+
elif verbose:
63+
print "yes"
64+
65+
try:
66+
oldlocale = locale.setlocale(locale.LC_CTYPE)
67+
locale.setlocale(locale.LC_CTYPE, 'en_US.UTF-8')
68+
except locale.Error:
69+
pass
70+
else:
71+
try:
72+
teststrop('\x20', 'isspace', True)
73+
teststrop('\xa0', 'isspace', False)
74+
teststrop('\xa1', 'isspace', False)
75+
teststrop('\xc0', 'isalpha', False)
76+
teststrop('\xc0', 'isalnum', False)
77+
teststrop('\xc0', 'isupper', False)
78+
teststrop('\xc0', 'islower', False)
79+
teststrop('\xec\xa0\xbc', 'split', ['\xec\xa0\xbc'])
80+
teststrop('\xed\x95\xa0', 'strip', '\xed\x95\xa0')
81+
teststrop('\xcc\x85', 'lower', '\xcc\x85')
82+
teststrop('\xed\x95\xa0', 'upper', '\xed\x95\xa0')
83+
finally:
84+
locale.setlocale(locale.LC_CTYPE, oldlocale)

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,9 @@ Core and builtins
6464

6565
- Implemented bind_textdomain_codeset() in locale module.
6666

67+
- Added a workaround for proper string operations in BSDs. str.split
68+
and str.is* methods can now work correctly with UTF-8 locales.
69+
6770
Extension modules
6871
-----------------
6972

0 commit comments

Comments
 (0)