Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 41980ca

Browse files
committed
Apply SF patch #1775604: This adds three new codecs (utf-32, utf-32-le and
utf-32-be). On narrow builds the codecs combine surrogate pairs in the unicode object into one codepoint on encoding and create surrogate pairs for codepoints outside the BMP on decoding. Lone surrogates are passed through unchanged in all cases. Backport to the trunk will follow.
1 parent 0661009 commit 41980ca

12 files changed

Lines changed: 1001 additions & 2 deletions

File tree

Doc/c-api/concrete.rst

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1405,6 +1405,74 @@ These are the UTF-8 codec APIs:
14051405
object. Error handling is "strict". Return *NULL* if an exception was raised
14061406
by the codec.
14071407

1408+
These are the UTF-32 codec APIs:
1409+
1410+
.. % --- UTF-32 Codecs ------------------------------------------------------ */
1411+
1412+
1413+
.. cfunction:: PyObject* PyUnicode_DecodeUTF32(const char *s, Py_ssize_t size, const char *errors, int *byteorder)
1414+
1415+
Decode *size* bytes from a UTF-32 encoded buffer string and return the
1416+
corresponding Unicode object. *errors* (if non-*NULL*) defines the error
1417+
handling. It defaults to "strict".
1418+
1419+
If *byteorder* is non-*NULL*, the decoder starts decoding using the given byte
1420+
order::
1421+
1422+
*byteorder == -1: little endian
1423+
*byteorder == 0: native order
1424+
*byteorder == 1: big endian
1425+
1426+
and then switches if the first four bytes of the input data are a byte order mark
1427+
(BOM) and the specified byte order is native order. This BOM is not copied into
1428+
the resulting Unicode string. After completion, *\*byteorder* is set to the
1429+
current byte order at the end of input data.
1430+
1431+
In a narrow build codepoints outside the BMP will be decoded as surrogate pairs.
1432+
1433+
If *byteorder* is *NULL*, the codec starts in native order mode.
1434+
1435+
Return *NULL* if an exception was raised by the codec.
1436+
1437+
.. versionadded:: 3.0
1438+
1439+
1440+
.. cfunction:: PyObject* PyUnicode_DecodeUTF32Stateful(const char *s, Py_ssize_t size, const char *errors, int *byteorder, Py_ssize_t *consumed)
1441+
1442+
If *consumed* is *NULL*, behave like :cfunc:`PyUnicode_DecodeUTF32`. If
1443+
*consumed* is not *NULL*, :cfunc:`PyUnicode_DecodeUTF32Stateful` will not treat
1444+
trailing incomplete UTF-32 byte sequences (such as a number of bytes not divisible
1445+
by four) as an error. Those bytes will not be decoded and the number of bytes
1446+
that have been decoded will be stored in *consumed*.
1447+
1448+
.. versionadded:: 3.0
1449+
1450+
1451+
.. cfunction:: PyObject* PyUnicode_EncodeUTF32(const Py_UNICODE *s, Py_ssize_t size, const char *errors, int byteorder)
1452+
1453+
Return a Python bytes object holding the UTF-32 encoded value of the Unicode
1454+
data in *s*. Output is written according to the value of *byteorder*, using the
1455+
following byte order::
1456+
1457+
byteorder == -1: little endian
1458+
byteorder == 0: native byte order (writes a BOM mark)
1459+
byteorder == 1: big endian
1460+
1461+
If byteorder is ``0``, the output string will always start with the Unicode BOM
1462+
mark (U+FEFF). In the other two modes, no BOM mark is prepended.
1463+
1464+
If *Py_UNICODE_WIDE* is not defined, surrogate pairs will be output
1465+
as a single codepoint.
1466+
1467+
Return *NULL* if an exception was raised by the codec.
1468+
1469+
1470+
.. cfunction:: PyObject* PyUnicode_AsUTF32String(PyObject *unicode)
1471+
1472+
Return a Python bytes object using the UTF-32 encoding in native byte order. The
1473+
bytes object always starts with a BOM mark. Error handling is "strict". Return
1474+
*NULL* if an exception was raised by the codec.
1475+
14081476
These are the UTF-16 codec APIs:
14091477

14101478
.. % --- UTF-16 Codecs ------------------------------------------------------ */

Doc/library/codecs.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1089,6 +1089,12 @@ particular, the following variants typically exist:
10891089
| shift_jisx0213 | shiftjisx0213, sjisx0213, | Japanese |
10901090
| | s_jisx0213 | |
10911091
+-----------------+--------------------------------+--------------------------------+
1092+
| utf_32 | U32, utf32 | all languages |
1093+
+-----------------+--------------------------------+--------------------------------+
1094+
| utf_32_be | UTF-32BE | all languages |
1095+
+-----------------+--------------------------------+--------------------------------+
1096+
| utf_32_le | UTF-32LE | all languages |
1097+
+-----------------+--------------------------------+--------------------------------+
10921098
| utf_16 | U16, utf16 | all languages |
10931099
+-----------------+--------------------------------+--------------------------------+
10941100
| utf_16_be | UTF-16BE | all languages (BMP only) |

Include/unicodeobject.h

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
138138
# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
139139
# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
140140
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
141+
# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
141142
# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
142143
# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
143144
# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
@@ -154,6 +155,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
154155
# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
155156
# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
156157
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
158+
# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
159+
# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
157160
# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
158161
# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
159162
# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
@@ -165,6 +168,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
165168
# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
166169
# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
167170
# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
171+
# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32
168172
# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
169173
# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
170174
# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
@@ -225,6 +229,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
225229
# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
226230
# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
227231
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
232+
# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
228233
# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
229234
# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
230235
# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
@@ -241,6 +246,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
241246
# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
242247
# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
243248
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
249+
# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
250+
# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
244251
# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
245252
# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
246253
# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
@@ -252,6 +259,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
252259
# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
253260
# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
254261
# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
262+
# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32
255263
# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
256264
# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
257265
# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
@@ -749,6 +757,80 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
749757
const char *errors /* error handling */
750758
);
751759

760+
/* --- UTF-32 Codecs ------------------------------------------------------ */
761+
762+
/* Decodes length bytes from a UTF-32 encoded buffer string and returns
763+
the corresponding Unicode object.
764+
765+
errors (if non-NULL) defines the error handling. It defaults
766+
to "strict".
767+
768+
If byteorder is non-NULL, the decoder starts decoding using the
769+
given byte order:
770+
771+
*byteorder == -1: little endian
772+
*byteorder == 0: native order
773+
*byteorder == 1: big endian
774+
775+
In native mode, the first four bytes of the stream are checked for a
776+
BOM mark. If found, the BOM mark is analysed, the byte order
777+
adjusted and the BOM skipped. In the other modes, no BOM mark
778+
interpretation is done. After completion, *byteorder is set to the
779+
current byte order at the end of input data.
780+
781+
If byteorder is NULL, the codec starts in native order mode.
782+
783+
*/
784+
785+
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
786+
const char *string, /* UTF-32 encoded string */
787+
Py_ssize_t length, /* size of string */
788+
const char *errors, /* error handling */
789+
int *byteorder /* pointer to byteorder to use
790+
0=native;-1=LE,1=BE; updated on
791+
exit */
792+
);
793+
794+
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
795+
const char *string, /* UTF-32 encoded string */
796+
Py_ssize_t length, /* size of string */
797+
const char *errors, /* error handling */
798+
int *byteorder, /* pointer to byteorder to use
799+
0=native;-1=LE,1=BE; updated on
800+
exit */
801+
Py_ssize_t *consumed /* bytes consumed */
802+
);
803+
804+
/* Returns a Python string using the UTF-32 encoding in native byte
805+
order. The string always starts with a BOM mark. */
806+
807+
PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
808+
PyObject *unicode /* Unicode object */
809+
);
810+
811+
/* Returns a Python string object holding the UTF-32 encoded value of
812+
the Unicode data.
813+
814+
If byteorder is not 0, output is written according to the following
815+
byte order:
816+
817+
byteorder == -1: little endian
818+
byteorder == 0: native byte order (writes a BOM mark)
819+
byteorder == 1: big endian
820+
821+
If byteorder is 0, the output string will always start with the
822+
Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
823+
prepended.
824+
825+
*/
826+
827+
PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
828+
const Py_UNICODE *data, /* Unicode char buffer */
829+
Py_ssize_t length, /* number of Py_UNICODE chars to encode */
830+
const char *errors, /* error handling */
831+
int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
832+
);
833+
752834
/* --- UTF-16 Codecs ------------------------------------------------------ */
753835

754836
/* Decodes length bytes from a UTF-16 encoded buffer string and returns

Lib/encodings/aliases.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -490,6 +490,16 @@
490490
'unicodelittleunmarked' : 'utf_16_le',
491491
'utf_16le' : 'utf_16_le',
492492

493+
# utf_32 codec
494+
'u32' : 'utf_32',
495+
'utf32' : 'utf_32',
496+
497+
# utf_32_be codec
498+
'utf_32be' : 'utf_32_be',
499+
500+
# utf_32_le codec
501+
'utf_32le' : 'utf_32_le',
502+
493503
# utf_7 codec
494504
'u7' : 'utf_7',
495505
'utf7' : 'utf_7',

Lib/encodings/utf_32.py

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
"""
2+
Python 'utf-32' Codec
3+
"""
4+
import codecs, sys
5+
6+
### Codec APIs
7+
8+
encode = codecs.utf_32_encode
9+
10+
def decode(input, errors='strict'):
    """Decode a complete UTF-32 byte string.

    Delegates to ``codecs.utf_32_decode`` with its third (``final``)
    argument set to ``True`` and returns that call's result unchanged.
    """
    final = True
    return codecs.utf_32_decode(input, errors, final)
12+
13+
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental UTF-32 encoder.

    The first chunk after creation or ``reset()`` is encoded with
    ``codecs.utf_32_encode``; every later chunk is encoded with the
    little- or big-endian encoder selected from ``sys.byteorder``.
    """

    def __init__(self, errors='strict'):
        codecs.IncrementalEncoder.__init__(self, errors)
        # None means no chunk has been encoded since creation/reset.
        self.encoder = None

    def encode(self, input, final=False):
        """Encode *input* and return the encoded data (element 0 of the
        underlying codec call's result)."""
        if self.encoder is not None:
            return self.encoder(input, self.errors)[0]
        first_chunk = codecs.utf_32_encode(input, self.errors)[0]
        # Only switch encoders once the first chunk encoded successfully.
        self.encoder = (codecs.utf_32_le_encode
                        if sys.byteorder == 'little'
                        else codecs.utf_32_be_encode)
        return first_chunk

    def reset(self):
        """Forget the selected encoder so the next chunk is a "first" one."""
        codecs.IncrementalEncoder.reset(self)
        self.encoder = None

    def getstate(self):
        """Return the encoder state as an integer.

        0: stream is in natural order for this platform
        2: endianness hasn't been determined yet
        (this encoder never writes in unnatural order)
        """
        if self.encoder is None:
            return 2
        return 0

    def setstate(self, state):
        """Restore a state previously returned by ``getstate()``.

        Any truthy *state* clears the encoder; a falsy one selects the
        encoder matching ``sys.byteorder``.
        """
        if state:
            self.encoder = None
            return
        self.encoder = (codecs.utf_32_le_encode
                        if sys.byteorder == 'little'
                        else codecs.utf_32_be_encode)
47+
48+
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental UTF-32 decoder.

    Until a byte order is known, input goes through
    ``codecs.utf_32_ex_decode``; the byte order it reports selects the
    little- or big-endian decoder used for all later input.
    """

    def __init__(self, errors='strict'):
        codecs.BufferedIncrementalDecoder.__init__(self, errors)
        # None until the byte order has been determined.
        self.decoder = None

    def _buffer_decode(self, input, errors, final):
        """Decode *input* and return ``(output, consumed)``.

        Raises UnicodeError when at least four bytes were consumed but
        no byte order was reported.
        """
        if self.decoder is not None:
            return self.decoder(input, self.errors, final)

        probe = codecs.utf_32_ex_decode(input, errors, 0, final)
        output, consumed, byteorder = probe
        if byteorder == -1:
            self.decoder = codecs.utf_32_le_decode
        elif byteorder == 1:
            self.decoder = codecs.utf_32_be_decode
        elif consumed >= 4:
            raise UnicodeError("UTF-32 stream does not start with BOM")
        return (output, consumed)

    def reset(self):
        """Reset the base-class state and forget the selected decoder."""
        codecs.BufferedIncrementalDecoder.reset(self)
        self.decoder = None

    def getstate(self):
        """Return ``(buffered_input, flag)``.

        Only element 0 of the base-class state is kept; the additional
        state info from the base class is not passed along to the caller.
        The flag passed to the caller instead is:
        0: stream is in natural order for this platform
        1: stream is in unnatural order
        2: endianness hasn't been determined yet
        """
        buffered = codecs.BufferedIncrementalDecoder.getstate(self)[0]
        if self.decoder is None:
            return (buffered, 2)
        host_is_big = sys.byteorder == "big"
        stream_is_big = self.decoder is codecs.utf_32_be_decode
        return (buffered, int(host_is_big != stream_is_big))

    def setstate(self, state):
        """Restore a state previously returned by ``getstate()``."""
        # The base class receives the full state tuple; this class then
        # interprets element 1 as the byte-order flag.
        codecs.BufferedIncrementalDecoder.setstate(self, state)
        flag = state[1]
        if sys.byteorder == "big":
            natural, unnatural = codecs.utf_32_be_decode, codecs.utf_32_le_decode
        else:
            natural, unnatural = codecs.utf_32_le_decode, codecs.utf_32_be_decode
        if flag == 0:
            self.decoder = natural
        elif flag == 1:
            self.decoder = unnatural
        else:
            self.decoder = None
98+
99+
class StreamWriter(codecs.StreamWriter):
    """Stream writer for UTF-32 that emits a BOM before the first data.

    The first ``encode()`` call after construction or ``reset()`` uses
    ``codecs.utf_32_encode``; later calls use the little- or big-endian
    encoder selected from ``sys.byteorder``.
    """

    def __init__(self, stream, errors='strict'):
        codecs.StreamWriter.__init__(self, stream, errors)
        # Encoder used after the first chunk; None means the next chunk
        # is a "first" one.
        self.encoder = None
        # Kept for backward compatibility with code inspecting this flag.
        self.bom_written = False

    def reset(self):
        """Return to the initial state so the next write is a first chunk.

        Without this override the writer stayed on the BOM-less encoder
        after a reset.
        """
        codecs.StreamWriter.reset(self)
        self.encoder = None
        self.bom_written = False

    def encode(self, input, errors='strict'):
        """Encode *input* and return the underlying codec call's result."""
        if self.encoder is not None:
            return self.encoder(input, errors)
        result = codecs.utf_32_encode(input, errors)
        # Switch encoders only after the first chunk encoded successfully.
        if sys.byteorder == 'little':
            self.encoder = codecs.utf_32_le_encode
        else:
            self.encoder = codecs.utf_32_be_encode
        self.bom_written = True
        return result
112+
113+
class StreamReader(codecs.StreamReader):
    """Stream reader for UTF-32 that picks its byte order from the BOM.

    The class-level ``decode()`` probes the input with
    ``codecs.utf_32_ex_decode``. Once a byte order is reported, it
    shadows itself with an instance attribute bound to the matching
    fixed-endian decoder; ``reset()`` removes that attribute again.
    """

    def reset(self):
        """Reset the base-class state and re-enable byte-order detection."""
        codecs.StreamReader.reset(self)
        # Drop the instance-level decoder installed by decode(), if any,
        # so the class-level decode() is used again.
        try:
            del self.decode
        except AttributeError:
            pass

    def decode(self, input, errors='strict'):
        """Decode *input* and return ``(output, consumed)``.

        Raises UnicodeError when at least four bytes were consumed but
        no byte order was reported.
        """
        (output, consumed, byteorder) = \
            codecs.utf_32_ex_decode(input, errors, 0, False)
        if byteorder == -1:
            self.decode = codecs.utf_32_le_decode
        elif byteorder == 1:
            # Big-endian stream: must use the big-endian decoder.
            self.decode = codecs.utf_32_be_decode
        elif consumed >= 4:
            raise UnicodeError("UTF-32 stream does not start with BOM")
        return (output, consumed)
132+
133+
### encodings module API
134+
135+
def getregentry():
    """Build the ``codecs.CodecInfo`` entry for the 'utf-32' codec.

    The entry wires together this module's stateless functions and its
    incremental and stream classes.
    """
    entry = dict(
        name='utf-32',
        encode=encode,
        decode=decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
    return codecs.CodecInfo(**entry)

0 commit comments

Comments
 (0)