Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit fb10c3f

Browse files
committed
Minimal fix for the complaints about pickling Unicode objects (SF bugs #126161 and #123634).
The solution doesn't use the unicode-escape encoding; that has other problems (it seems not 100% reversible). Rather, it transforms the input Unicode object slightly before encoding it using raw-unicode-escape, so that the decoding will reconstruct the original string: backslash and newline characters are translated into their \uXXXX counterparts. This is backwards incompatible for strings containing backslashes, but for some of those strings, the pickling was already broken. Note that SF bug #123634 complains specifically that cPickle fails to unpickle the pickle for u'' (the empty Unicode string) correctly. This was an off-by-one error in load_unicode(). XXX Ugliness: in order to do the modified raw-unicode-escape, I've cut-and-pasted a copy of PyUnicode_EncodeRawUnicodeEscape() into this file that also encodes '\\' and '\n'. It might be nice to migrate this into the Unicode implementation and give this encoding a new name ('half-raw-unicode-escape'? 'pickle-unicode-escape'?); that would help pickle.py too. But right now I can't be bothered with the necessary infrastructural changes.
1 parent 8b74b15 commit fb10c3f

1 file changed

Lines changed: 48 additions & 2 deletions

File tree

Modules/cPickle.c

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1149,6 +1149,51 @@ save_string(Picklerobject *self, PyObject *args, int doput) {
11491149
}
11501150

11511151

1152+
/* A copy of PyUnicode_EncodeRawUnicodeEscape() that also translates
1153+
backslash and newline characters to \uXXXX escapes. */
1154+
static PyObject *
1155+
modified_EncodeRawUnicodeEscape(const Py_UNICODE *s, int size)
1156+
{
1157+
PyObject *repr;
1158+
char *p;
1159+
char *q;
1160+
1161+
static const char *hexdigit = "0123456789ABCDEF";
1162+
1163+
repr = PyString_FromStringAndSize(NULL, 6 * size);
1164+
if (repr == NULL)
1165+
return NULL;
1166+
if (size == 0)
1167+
return repr;
1168+
1169+
p = q = PyString_AS_STRING(repr);
1170+
while (size-- > 0) {
1171+
Py_UNICODE ch = *s++;
1172+
/* Map 16-bit characters to '\uxxxx' */
1173+
if (ch >= 256 || ch == '\\' || ch == '\n') {
1174+
*p++ = '\\';
1175+
*p++ = 'u';
1176+
*p++ = hexdigit[(ch >> 12) & 0xf];
1177+
*p++ = hexdigit[(ch >> 8) & 0xf];
1178+
*p++ = hexdigit[(ch >> 4) & 0xf];
1179+
*p++ = hexdigit[ch & 15];
1180+
}
1181+
/* Copy everything else as-is */
1182+
else
1183+
*p++ = (char) ch;
1184+
}
1185+
*p = '\0';
1186+
if (_PyString_Resize(&repr, p - q))
1187+
goto onError;
1188+
1189+
return repr;
1190+
1191+
onError:
1192+
Py_DECREF(repr);
1193+
return NULL;
1194+
}
1195+
1196+
11521197
static int
11531198
save_unicode(Picklerobject *self, PyObject *args, int doput) {
11541199
int size, len;
@@ -1161,7 +1206,8 @@ save_unicode(Picklerobject *self, PyObject *args, int doput) {
11611206
char *repr_str;
11621207
static char string = UNICODE;
11631208

1164-
UNLESS (repr = PyUnicode_AsRawUnicodeEscapeString(args))
1209+
UNLESS(repr = modified_EncodeRawUnicodeEscape(
1210+
PyUnicode_AS_UNICODE(args), PyUnicode_GET_SIZE(args)))
11651211
return -1;
11661212

11671213
if ((len = PyString_Size(repr)) < 0)
@@ -2745,7 +2791,7 @@ load_unicode(Unpicklerobject *self) {
27452791
char *s;
27462792

27472793
if ((len = (*self->readline_func)(self, &s)) < 0) return -1;
2748-
if (len < 2) return bad_readline();
2794+
if (len < 1) return bad_readline();
27492795

27502796
UNLESS (str = PyUnicode_DecodeRawUnicodeEscape(s, len - 1, NULL))
27512797
goto finally;

0 commit comments

Comments
 (0)