Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 51ac580

Browse files
committed
On 17-Mar-2000, Marc-Andre Lemburg said:
Attached you find an update of the Unicode implementation. The patch is against the current CVS version. I would appreciate it if someone with CVS checkin permissions could check the changes in. The patch contains all bug fixes and patches sent this week and also fixes a leak in the codecs code and a bug in the free list code for Unicode objects (which only shows up when compiling Python with Py_DEBUG; thanks to MarkH for spotting this one).
1 parent abc411b commit 51ac580

9 files changed

Lines changed: 61 additions & 39 deletions

File tree

Include/unicodeobject.h

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,5 @@
11
#ifndef Py_UNICODEOBJECT_H
22
#define Py_UNICODEOBJECT_H
3-
#ifdef __cplusplus
4-
extern "C" {
5-
#endif
63

74
/*
85
@@ -109,8 +106,9 @@ typedef unsigned short Py_UNICODE;
109106
/* --- Internal Unicode Operations ---------------------------------------- */
110107

111108
/* If you want Python to use the compiler's wctype.h functions instead
112-
of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS.
113-
This reduces the interpreter's code size. */
109+
of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
110+
configure Python using --with-ctype-functions. This reduces the
111+
interpreter's code size. */
114112

115113
#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
116114

@@ -169,6 +167,10 @@ typedef unsigned short Py_UNICODE;
169167
(!memcmp((string)->str + (offset), (substring)->str,\
170168
(substring)->length*sizeof(Py_UNICODE)))
171169

170+
#ifdef __cplusplus
171+
extern "C" {
172+
#endif
173+
172174
/* --- Unicode Type ------------------------------------------------------- */
173175

174176
typedef struct {
@@ -647,7 +649,7 @@ extern DL_IMPORT(int) PyUnicode_Find(
647649
int direction /* Find direction: +1 forward, -1 backward */
648650
);
649651

650-
/* Count the number of occurances of substr in str[start:end]. */
652+
/* Count the number of occurrences of substr in str[start:end]. */
651653

652654
extern DL_IMPORT(int) PyUnicode_Count(
653655
PyObject *str, /* String */
@@ -656,7 +658,7 @@ extern DL_IMPORT(int) PyUnicode_Count(
656658
int end /* Stop index */
657659
);
658660

659-
/* Replace at most maxcount occurances of substr in str with replstr
661+
/* Replace at most maxcount occurrences of substr in str with replstr
660662
and return the resulting Unicode object. */
661663

662664
extern DL_IMPORT(PyObject *) PyUnicode_Replace(

Lib/encodings/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,13 @@
3030
import string,codecs,aliases
3131

3232
_cache = {}
33-
_unkown = '--unkown--'
33+
_unknown = '--unknown--'
3434

3535
def search_function(encoding):
3636

3737
# Cache lookup
38-
entry = _cache.get(encoding,_unkown)
39-
if entry is not _unkown:
38+
entry = _cache.get(encoding,_unknown)
39+
if entry is not _unknown:
4040
return entry
4141

4242
# Import the module

Lib/test/test_string.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ def __init__(self): self.seq = [7, 'hello', 123L]
143143
test('translate', 'xyz', 'xyz', table)
144144

145145
test('replace', 'one!two!three!', 'one@two!three!', '!', '@', 1)
146+
test('replace', 'one!two!three!', 'onetwothree', '!', '')
146147
test('replace', 'one!two!three!', 'one@two@three!', '!', '@', 2)
147148
test('replace', 'one!two!three!', 'one@two@three@', '!', '@', 3)
148149
test('replace', 'one!two!three!', 'one@two@three@', '!', '@', 4)

Lib/test/test_unicode.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ def __init__(self): self.seq = [7, u'hello', 123L]
108108
test('translate', u'xyz', u'xyz', table)
109109

110110
test('replace', u'one!two!three!', u'one@two!three!', u'!', u'@', 1)
111+
test('replace', u'one!two!three!', u'onetwothree', '!', '')
111112
test('replace', u'one!two!three!', u'one@two@three!', u'!', u'@', 2)
112113
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 3)
113114
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 4)

Misc/unicode.txt

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -743,8 +743,9 @@ For explicit handling of files using Unicode, the standard
743743
stream codecs as available through the codecs module should
744744
be used.
745745

746-
XXX There should be a short-cut open(filename,mode,encoding) available which
747-
also assures that mode contains the 'b' character when needed.
746+
The codecs module should provide a short-cut open(filename,mode,encoding)
747+
which also ensures that mode contains the 'b' character when
748+
needed.
748749

749750

750751
File/Stream Input:
@@ -810,6 +811,10 @@ Unicode-Mappings:
810811
Introduction to Unicode (a little outdated but still nice to read):
811812
http://www.nada.kth.se/i18n/ucs/unicode-iso10646-oview.html
812813

814+
For comparison:
815+
Introducing Unicode to ECMAScript --
816+
http://www-4.ibm.com/software/developer/library/internationalization-support.html
817+
813818
Encodings:
814819

815820
Overview:
@@ -832,7 +837,7 @@ Encodings:
832837

833838
History of this Proposal:
834839
-------------------------
835-
1.2:
840+
1.2: Removed POD about codecs.open()
836841
1.1: Added note about comparisons and hash values. Added note about
837842
case mapping algorithms. Changed stream codecs .read() and
838843
.write() method to match the standard file-like object methods

Modules/stropmodule.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1054,7 +1054,7 @@ strop_translate(self, args)
10541054
10551055
strstr replacement for arbitrary blocks of memory.
10561056
1057-
Locates the first occurance in the memory pointed to by MEM of the
1057+
Locates the first occurrence in the memory pointed to by MEM of the
10581058
contents of memory pointed to by PAT. Returns the index into MEM if
10591059
found, or -1 if not found. If len of PAT is greater than length of
10601060
MEM, the function returns -1.

Objects/stringobject.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1395,7 +1395,7 @@ string_translate(self, args)
13951395
13961396
strstr replacement for arbitrary blocks of memory.
13971397
1398-
Locates the first occurance in the memory pointed to by MEM of the
1398+
Locates the first occurrence in the memory pointed to by MEM of the
13991399
contents of memory pointed to by PAT. Returns the index into MEM if
14001400
found, or -1 if not found. If len of PAT is greater than length of
14011401
MEM, the function returns -1.
@@ -1578,7 +1578,7 @@ string_replace(self, args)
15781578
return NULL;
15791579

15801580
if (sub_len <= 0) {
1581-
PyErr_SetString(PyExc_ValueError, "empty replacement string");
1581+
PyErr_SetString(PyExc_ValueError, "empty pattern string");
15821582
return NULL;
15831583
}
15841584
new_s = mymemreplace(str,len,sub,sub_len,repl,repl_len,count,&out_len);

Objects/unicodeobject.c

Lines changed: 24 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ Unicode Integration Proposal (see file Misc/unicode.txt).
8383
all objects on the free list having a size less than this
8484
limit. This reduces malloc() overhead for small Unicode objects.
8585
86-
At worse this will result in MAX_UNICODE_FREELIST_SIZE *
86+
At worst this will result in MAX_UNICODE_FREELIST_SIZE *
8787
(sizeof(PyUnicodeObject) + STAYALIVE_SIZE_LIMIT +
8888
malloc()-overhead) bytes of unused garbage.
8989
@@ -180,7 +180,7 @@ PyUnicodeObject *_PyUnicode_New(int length)
180180
unicode_freelist = *(PyUnicodeObject **)unicode_freelist;
181181
unicode_freelist_size--;
182182
unicode->ob_type = &PyUnicode_Type;
183-
_Py_NewReference(unicode);
183+
_Py_NewReference((PyObject *)unicode);
184184
if (unicode->str) {
185185
if (unicode->length < length &&
186186
_PyUnicode_Resize(unicode, length)) {
@@ -199,16 +199,19 @@ PyUnicodeObject *_PyUnicode_New(int length)
199199
unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
200200
}
201201

202-
if (!unicode->str) {
203-
PyMem_DEL(unicode);
204-
PyErr_NoMemory();
205-
return NULL;
206-
}
202+
if (!unicode->str)
203+
goto onError;
207204
unicode->str[length] = 0;
208205
unicode->length = length;
209206
unicode->hash = -1;
210207
unicode->utf8str = NULL;
211208
return unicode;
209+
210+
onError:
211+
_Py_ForgetReference((PyObject *)unicode);
212+
PyMem_DEL(unicode);
213+
PyErr_NoMemory();
214+
return NULL;
212215
}
213216

214217
static
@@ -224,7 +227,6 @@ void _PyUnicode_Free(register PyUnicodeObject *unicode)
224227
*(PyUnicodeObject **)unicode = unicode_freelist;
225228
unicode_freelist = unicode;
226229
unicode_freelist_size++;
227-
_Py_ForgetReference(unicode);
228230
}
229231
else {
230232
free(unicode->str);
@@ -489,7 +491,7 @@ int utf8_decoding_error(const char **source,
489491
}
490492
else {
491493
PyErr_Format(PyExc_ValueError,
492-
"UTF-8 decoding error; unkown error handling code: %s",
494+
"UTF-8 decoding error; unknown error handling code: %s",
493495
errors);
494496
return -1;
495497
}
@@ -611,7 +613,7 @@ int utf8_encoding_error(const Py_UNICODE **source,
611613
else {
612614
PyErr_Format(PyExc_ValueError,
613615
"UTF-8 encoding error; "
614-
"unkown error handling code: %s",
616+
"unknown error handling code: %s",
615617
errors);
616618
return -1;
617619
}
@@ -733,7 +735,7 @@ int utf16_decoding_error(const Py_UNICODE **source,
733735
}
734736
else {
735737
PyErr_Format(PyExc_ValueError,
736-
"UTF-16 decoding error; unkown error handling code: %s",
738+
"UTF-16 decoding error; unknown error handling code: %s",
737739
errors);
738740
return -1;
739741
}
@@ -921,7 +923,7 @@ int unicodeescape_decoding_error(const char **source,
921923
else {
922924
PyErr_Format(PyExc_ValueError,
923925
"Unicode-Escape decoding error; "
924-
"unkown error handling code: %s",
926+
"unknown error handling code: %s",
925927
errors);
926928
return -1;
927929
}
@@ -1051,6 +1053,10 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
10511053
10521054
*/
10531055

1056+
static const Py_UNICODE *findchar(const Py_UNICODE *s,
1057+
int size,
1058+
Py_UNICODE ch);
1059+
10541060
static
10551061
PyObject *unicodeescape_string(const Py_UNICODE *s,
10561062
int size,
@@ -1069,9 +1075,6 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
10691075
p = q = PyString_AS_STRING(repr);
10701076

10711077
if (quotes) {
1072-
static const Py_UNICODE *findchar(const Py_UNICODE *s,
1073-
int size,
1074-
Py_UNICODE ch);
10751078
*p++ = 'u';
10761079
*p++ = (findchar(s, size, '\'') &&
10771080
!findchar(s, size, '"')) ? '"' : '\'';
@@ -1298,7 +1301,7 @@ int latin1_encoding_error(const Py_UNICODE **source,
12981301
else {
12991302
PyErr_Format(PyExc_ValueError,
13001303
"Latin-1 encoding error; "
1301-
"unkown error handling code: %s",
1304+
"unknown error handling code: %s",
13021305
errors);
13031306
return -1;
13041307
}
@@ -1369,7 +1372,7 @@ int ascii_decoding_error(const char **source,
13691372
else {
13701373
PyErr_Format(PyExc_ValueError,
13711374
"ASCII decoding error; "
1372-
"unkown error handling code: %s",
1375+
"unknown error handling code: %s",
13731376
errors);
13741377
return -1;
13751378
}
@@ -1431,7 +1434,7 @@ int ascii_encoding_error(const Py_UNICODE **source,
14311434
else {
14321435
PyErr_Format(PyExc_ValueError,
14331436
"ASCII encoding error; "
1434-
"unkown error handling code: %s",
1437+
"unknown error handling code: %s",
14351438
errors);
14361439
return -1;
14371440
}
@@ -1502,7 +1505,7 @@ int charmap_decoding_error(const char **source,
15021505
else {
15031506
PyErr_Format(PyExc_ValueError,
15041507
"charmap decoding error; "
1505-
"unkown error handling code: %s",
1508+
"unknown error handling code: %s",
15061509
errors);
15071510
return -1;
15081511
}
@@ -1618,7 +1621,7 @@ int charmap_encoding_error(const Py_UNICODE **source,
16181621
else {
16191622
PyErr_Format(PyExc_ValueError,
16201623
"charmap encoding error; "
1621-
"unkown error handling code: %s",
1624+
"unknown error handling code: %s",
16221625
errors);
16231626
return -1;
16241627
}
@@ -1750,7 +1753,7 @@ int translate_error(const Py_UNICODE **source,
17501753
else {
17511754
PyErr_Format(PyExc_ValueError,
17521755
"translate error; "
1753-
"unkown error handling code: %s",
1756+
"unknown error handling code: %s",
17541757
errors);
17551758
return -1;
17561759
}

Python/codecs.c

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,9 +93,14 @@ PyObject *lowercasestring(const char *string)
9393

9494
PyObject *_PyCodec_Lookup(const char *encoding)
9595
{
96-
PyObject *result, *args = NULL, *v;
96+
PyObject *result, *args = NULL, *v = NULL;
9797
int i, len;
9898

99+
if (_PyCodec_SearchCache == NULL || _PyCodec_SearchPath == NULL) {
100+
PyErr_SetString(PyExc_SystemError,
101+
"codec module not properly initialized");
102+
goto onError;
103+
}
99104
if (!import_encodings_called)
100105
import_encodings();
101106

@@ -109,6 +114,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
109114
result = PyDict_GetItem(_PyCodec_SearchCache, v);
110115
if (result != NULL) {
111116
Py_INCREF(result);
117+
Py_DECREF(v);
112118
return result;
113119
}
114120

@@ -121,6 +127,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
121127
if (args == NULL)
122128
goto onError;
123129
PyTuple_SET_ITEM(args,0,v);
130+
v = NULL;
124131

125132
for (i = 0; i < len; i++) {
126133
PyObject *func;
@@ -146,7 +153,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
146153
if (i == len) {
147154
/* XXX Perhaps we should cache misses too ? */
148155
PyErr_SetString(PyExc_LookupError,
149-
"unkown encoding");
156+
"unknown encoding");
150157
goto onError;
151158
}
152159

@@ -156,6 +163,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
156163
return result;
157164

158165
onError:
166+
Py_XDECREF(v);
159167
Py_XDECREF(args);
160168
return NULL;
161169
}
@@ -378,5 +386,7 @@ void _PyCodecRegistry_Init()
378386
void _PyCodecRegistry_Fini()
379387
{
380388
Py_XDECREF(_PyCodec_SearchPath);
389+
_PyCodec_SearchPath = NULL;
381390
Py_XDECREF(_PyCodec_SearchCache);
391+
_PyCodec_SearchCache = NULL;
382392
}

0 commit comments

Comments
 (0)