Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit e450185

Browse files
committed
Issue #5006: Better handling of unicode byte-order marks (BOM) in the io library.
This means, for example, that opening a UTF-16 text file in append mode doesn't add a BOM at the end of the file if the file isn't empty.
1 parent b565577 commit e450185

6 files changed

Lines changed: 168 additions & 22 deletions

File tree

Lib/_pyio.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1436,6 +1436,15 @@ def __init__(self, buffer, encoding=None, errors=None, newline=None,
14361436
self._snapshot = None # info for reconstructing decoder state
14371437
self._seekable = self._telling = self.buffer.seekable()
14381438

1439+
if self._seekable and self.writable():
1440+
position = self.buffer.tell()
1441+
if position != 0:
1442+
try:
1443+
self._get_encoder().setstate(0)
1444+
except LookupError:
1445+
# Sometimes the encoder doesn't exist
1446+
pass
1447+
14391448
# self._snapshot is either None, or a tuple (dec_flags, next_input)
14401449
# where dec_flags is the second (integer) item of the decoder state
14411450
# and next_input is the chunk of input bytes that comes next after the
@@ -1741,6 +1750,17 @@ def seek(self, cookie, whence=0):
17411750
raise IOError("can't restore logical file position")
17421751
self._decoded_chars_used = chars_to_skip
17431752

1753+
# Finally, reset the encoder (only needed for proper BOM handling)
1754+
try:
1755+
encoder = self._encoder or self._get_encoder()
1756+
except LookupError:
1757+
# Sometimes the encoder doesn't exist
1758+
pass
1759+
else:
1760+
if cookie != 0:
1761+
encoder.setstate(0)
1762+
else:
1763+
encoder.reset()
17441764
return cookie
17451765

17461766
def read(self, n=None):

Lib/test/test_io.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1963,6 +1963,37 @@ def test_issue2282(self):
19631963

19641964
self.assertEqual(buffer.seekable(), txt.seekable())
19651965

1966+
def test_append_bom(self):
1967+
# The BOM is not written again when appending to a non-empty file
1968+
filename = support.TESTFN
1969+
for charset in ('utf-8-sig', 'utf-16', 'utf-32'):
1970+
with self.open(filename, 'w', encoding=charset) as f:
1971+
f.write('aaa')
1972+
pos = f.tell()
1973+
with self.open(filename, 'rb') as f:
1974+
self.assertEquals(f.read(), 'aaa'.encode(charset))
1975+
1976+
with self.open(filename, 'a', encoding=charset) as f:
1977+
f.write('xxx')
1978+
with self.open(filename, 'rb') as f:
1979+
self.assertEquals(f.read(), 'aaaxxx'.encode(charset))
1980+
1981+
def test_seek_bom(self):
1982+
# Same test, but when seeking manually
1983+
filename = support.TESTFN
1984+
for charset in ('utf-8-sig', 'utf-16', 'utf-32'):
1985+
with self.open(filename, 'w', encoding=charset) as f:
1986+
f.write('aaa')
1987+
pos = f.tell()
1988+
with self.open(filename, 'r+', encoding=charset) as f:
1989+
f.seek(pos)
1990+
f.write('zzz')
1991+
f.seek(0)
1992+
f.write('bbb')
1993+
with self.open(filename, 'rb') as f:
1994+
self.assertEquals(f.read(), 'bbbzzz'.encode(charset))
1995+
1996+
19661997
class CTextIOWrapperTest(TextIOWrapperTest):
19671998

19681999
def test_initialization(self):

Misc/NEWS

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@ Core and Builtins
2323
Library
2424
-------
2525

26+
- Issue #5006: Better handling of unicode byte-order marks (BOM) in the io
27+
library. This means, for example, that opening a UTF-16 text file in
28+
append mode doesn't add a BOM at the end of the file if the file isn't
29+
empty.
30+
2631
- Issue #4050: inspect.findsource/getsource now raise an IOError if the 'source'
2732
file is a binary. Patch by Brodie Rao, tests by Daniel Diniz. This fix
2833
corrects a pydoc regression.

Modules/_io/_iomodule.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,13 +41,15 @@ PyObject *_PyIO_str_readline;
4141
PyObject *_PyIO_str_reset;
4242
PyObject *_PyIO_str_seek;
4343
PyObject *_PyIO_str_seekable;
44+
PyObject *_PyIO_str_setstate;
4445
PyObject *_PyIO_str_tell;
4546
PyObject *_PyIO_str_truncate;
4647
PyObject *_PyIO_str_writable;
4748
PyObject *_PyIO_str_write;
4849

4950
PyObject *_PyIO_empty_str;
5051
PyObject *_PyIO_empty_bytes;
52+
PyObject *_PyIO_zero;
5153

5254

5355
PyDoc_STRVAR(module_doc,
@@ -734,6 +736,8 @@ PyInit__io(void)
734736
goto fail;
735737
if (!(_PyIO_str_seekable = PyUnicode_InternFromString("seekable")))
736738
goto fail;
739+
if (!(_PyIO_str_setstate = PyUnicode_InternFromString("setstate")))
740+
goto fail;
737741
if (!(_PyIO_str_tell = PyUnicode_InternFromString("tell")))
738742
goto fail;
739743
if (!(_PyIO_str_truncate = PyUnicode_InternFromString("truncate")))
@@ -747,6 +751,8 @@ PyInit__io(void)
747751
goto fail;
748752
if (!(_PyIO_empty_bytes = PyBytes_FromStringAndSize(NULL, 0)))
749753
goto fail;
754+
if (!(_PyIO_zero = PyLong_FromLong(0L)))
755+
goto fail;
750756

751757
state->initialized = 1;
752758

Modules/_io/_iomodule.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,10 +141,12 @@ extern PyObject *_PyIO_str_readline;
141141
extern PyObject *_PyIO_str_reset;
142142
extern PyObject *_PyIO_str_seek;
143143
extern PyObject *_PyIO_str_seekable;
144+
extern PyObject *_PyIO_str_setstate;
144145
extern PyObject *_PyIO_str_tell;
145146
extern PyObject *_PyIO_str_truncate;
146147
extern PyObject *_PyIO_str_writable;
147148
extern PyObject *_PyIO_str_write;
148149

149150
extern PyObject *_PyIO_empty_str;
150151
extern PyObject *_PyIO_empty_bytes;
152+
extern PyObject *_PyIO_zero;

Modules/_io/textio.c

Lines changed: 104 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -647,6 +647,8 @@ typedef struct
647647
char telling;
648648
/* Specialized encoding func (see below) */
649649
encodefunc_t encodefunc;
650+
/* Whether or not it's the start of the stream */
651+
char encoding_start_of_stream;
650652

651653
/* Reads and writes are internally buffered in order to speed things up.
652654
However, any read will first flush the write buffer if it isn't empty.
@@ -707,21 +709,50 @@ utf16le_encode(PyTextIOWrapperObject *self, PyObject *text)
707709
static PyObject *
708710
utf16_encode(PyTextIOWrapperObject *self, PyObject *text)
709711
{
710-
PyObject *res;
711-
res = PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text),
712-
PyUnicode_GET_SIZE(text),
713-
PyBytes_AS_STRING(self->errors), 0);
714-
if (res == NULL)
715-
return NULL;
716-
/* Next writes will skip the BOM and use native byte ordering */
712+
if (!self->encoding_start_of_stream) {
713+
/* Skip the BOM and use native byte ordering */
717714
#if defined(WORDS_BIGENDIAN)
718-
self->encodefunc = (encodefunc_t) utf16be_encode;
715+
return utf16be_encode(self, text);
719716
#else
720-
self->encodefunc = (encodefunc_t) utf16le_encode;
717+
return utf16le_encode(self, text);
721718
#endif
722-
return res;
719+
}
720+
return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text),
721+
PyUnicode_GET_SIZE(text),
722+
PyBytes_AS_STRING(self->errors), 0);
723723
}
724724

725+
static PyObject *
726+
utf32be_encode(PyTextIOWrapperObject *self, PyObject *text)
727+
{
728+
return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text),
729+
PyUnicode_GET_SIZE(text),
730+
PyBytes_AS_STRING(self->errors), 1);
731+
}
732+
733+
static PyObject *
734+
utf32le_encode(PyTextIOWrapperObject *self, PyObject *text)
735+
{
736+
return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text),
737+
PyUnicode_GET_SIZE(text),
738+
PyBytes_AS_STRING(self->errors), -1);
739+
}
740+
741+
static PyObject *
742+
utf32_encode(PyTextIOWrapperObject *self, PyObject *text)
743+
{
744+
if (!self->encoding_start_of_stream) {
745+
/* Skip the BOM and use native byte ordering */
746+
#if defined(WORDS_BIGENDIAN)
747+
return utf32be_encode(self, text);
748+
#else
749+
return utf32le_encode(self, text);
750+
#endif
751+
}
752+
return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text),
753+
PyUnicode_GET_SIZE(text),
754+
PyBytes_AS_STRING(self->errors), 0);
755+
}
725756

726757
static PyObject *
727758
utf8_encode(PyTextIOWrapperObject *self, PyObject *text)
@@ -749,10 +780,13 @@ typedef struct {
749780
static encodefuncentry encodefuncs[] = {
750781
{"ascii", (encodefunc_t) ascii_encode},
751782
{"iso8859-1", (encodefunc_t) latin1_encode},
783+
{"utf-8", (encodefunc_t) utf8_encode},
752784
{"utf-16-be", (encodefunc_t) utf16be_encode},
753785
{"utf-16-le", (encodefunc_t) utf16le_encode},
754786
{"utf-16", (encodefunc_t) utf16_encode},
755-
{"utf-8", (encodefunc_t) utf8_encode},
787+
{"utf-32-be", (encodefunc_t) utf32be_encode},
788+
{"utf-32-le", (encodefunc_t) utf32le_encode},
789+
{"utf-32", (encodefunc_t) utf32_encode},
756790
{NULL, NULL}
757791
};
758792

@@ -978,6 +1012,33 @@ TextIOWrapper_init(PyTextIOWrapperObject *self, PyObject *args, PyObject *kwds)
9781012
self->seekable = self->telling = PyObject_IsTrue(res);
9791013
Py_DECREF(res);
9801014

1015+
self->encoding_start_of_stream = 0;
1016+
if (self->seekable && self->encoder) {
1017+
PyObject *cookieObj;
1018+
int cmp;
1019+
1020+
self->encoding_start_of_stream = 1;
1021+
1022+
cookieObj = PyObject_CallMethodObjArgs(buffer, _PyIO_str_tell, NULL);
1023+
if (cookieObj == NULL)
1024+
goto error;
1025+
1026+
cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_EQ);
1027+
Py_DECREF(cookieObj);
1028+
if (cmp < 0) {
1029+
goto error;
1030+
}
1031+
1032+
if (cmp == 0) {
1033+
self->encoding_start_of_stream = 0;
1034+
res = PyObject_CallMethodObjArgs(self->encoder, _PyIO_str_setstate,
1035+
_PyIO_zero, NULL);
1036+
if (res == NULL)
1037+
goto error;
1038+
Py_DECREF(res);
1039+
}
1040+
}
1041+
9811042
self->ok = 1;
9821043
return 0;
9831044

@@ -1192,8 +1253,10 @@ TextIOWrapper_write(PyTextIOWrapperObject *self, PyObject *args)
11921253
needflush = 1;
11931254

11941255
/* XXX What if we were just reading? */
1195-
if (self->encodefunc != NULL)
1256+
if (self->encodefunc != NULL) {
11961257
b = (*self->encodefunc)((PyObject *) self, text);
1258+
self->encoding_start_of_stream = 0;
1259+
}
11971260
else
11981261
b = PyObject_CallMethodObjArgs(self->encoder,
11991262
_PyIO_str_encode, text, NULL);
@@ -1847,24 +1910,38 @@ _TextIOWrapper_decoder_setstate(PyTextIOWrapperObject *self,
18471910
return 0;
18481911
}
18491912

1913+
static int
1914+
_TextIOWrapper_encoder_setstate(PyTextIOWrapperObject *self,
1915+
CookieStruct *cookie)
1916+
{
1917+
PyObject *res;
1918+
/* Same as _TextIOWrapper_decoder_setstate() above. */
1919+
if (cookie->start_pos == 0 && cookie->dec_flags == 0) {
1920+
res = PyObject_CallMethodObjArgs(self->encoder, _PyIO_str_reset, NULL);
1921+
self->encoding_start_of_stream = 1;
1922+
}
1923+
else {
1924+
res = PyObject_CallMethodObjArgs(self->encoder, _PyIO_str_setstate,
1925+
_PyIO_zero, NULL);
1926+
self->encoding_start_of_stream = 0;
1927+
}
1928+
if (res == NULL)
1929+
return -1;
1930+
Py_DECREF(res);
1931+
return 0;
1932+
}
1933+
18501934
static PyObject *
18511935
TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
18521936
{
18531937
PyObject *cookieObj, *posobj;
18541938
CookieStruct cookie;
18551939
int whence = 0;
1856-
static PyObject *zero = NULL;
18571940
PyObject *res;
18581941
int cmp;
18591942

18601943
CHECK_INITIALIZED(self);
18611944

1862-
if (zero == NULL) {
1863-
zero = PyLong_FromLong(0L);
1864-
if (zero == NULL)
1865-
return NULL;
1866-
}
1867-
18681945
if (!PyArg_ParseTuple(args, "O|i:seek", &cookieObj, &whence))
18691946
return NULL;
18701947
CHECK_CLOSED(self);
@@ -1879,7 +1956,7 @@ TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
18791956

18801957
if (whence == 1) {
18811958
/* seek relative to current position */
1882-
cmp = PyObject_RichCompareBool(cookieObj, zero, Py_EQ);
1959+
cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_EQ);
18831960
if (cmp < 0)
18841961
goto fail;
18851962

@@ -1900,7 +1977,7 @@ TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
19001977
else if (whence == 2) {
19011978
/* seek relative to end of file */
19021979

1903-
cmp = PyObject_RichCompareBool(cookieObj, zero, Py_EQ);
1980+
cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_EQ);
19041981
if (cmp < 0)
19051982
goto fail;
19061983

@@ -1934,7 +2011,7 @@ TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
19342011
goto fail;
19352012
}
19362013

1937-
cmp = PyObject_RichCompareBool(cookieObj, zero, Py_LT);
2014+
cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_LT);
19382015
if (cmp < 0)
19392016
goto fail;
19402017

@@ -2013,6 +2090,11 @@ TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
20132090
goto fail;
20142091
}
20152092

2093+
/* Finally, reset the encoder (only needed for proper BOM handling) */
2094+
if (self->encoder) {
2095+
if (_TextIOWrapper_encoder_setstate(self, &cookie) < 0)
2096+
goto fail;
2097+
}
20162098
return cookieObj;
20172099
fail:
20182100
Py_XDECREF(cookieObj);

0 commit comments

Comments
 (0)