Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit d8855fd

Browse files
committed
Marc-Andre Lemburg:
Attached you find the latest update of the Unicode implementation. The patch is against the current CVS version. It includes the fix I posted yesterday for the core dump problem in codecs.c (was introduced by my previous patch set -- sorry), adds more tests for the codecs and two new parser markers "es" and "es#".
1 parent 27fc3c0 commit d8855fd

5 files changed

Lines changed: 259 additions & 6 deletions

File tree

Lib/codecs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ class Codec:
4646
handling schemes by providing the errors argument. These
4747
string values are defined:
4848
49-
'strict' - raise an error (or a subclass)
49+
'strict' - raise a ValueError (or a subclass)
5050
'ignore' - ignore the character and continue with the next
5151
'replace' - replace with a suitable replacement character;
5252
Python will use the official U+FFFD REPLACEMENT

Lib/test/output/test_unicode

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
test_unicode
22
Testing Unicode comparisons... done.
3-
Testing Unicode contains method... done.
43
Testing Unicode formatting strings... done.
54
Testing unicodedata module... done.

Lib/test/test_unicode.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,3 +293,33 @@ def __init__(self): self.seq = [7, u'hello', 123L]
293293
assert unicodedata.combining(u'\u20e1') == 230
294294

295295
print 'done.'
296+
297+
# Test builtin codecs
298+
print 'Testing builtin codecs...',
299+
300+
assert unicode('hello','ascii') == u'hello'
301+
assert unicode('hello','utf-8') == u'hello'
302+
assert unicode('hello','utf8') == u'hello'
303+
assert unicode('hello','latin-1') == u'hello'
304+
305+
assert u'hello'.encode('ascii') == 'hello'
306+
assert u'hello'.encode('utf-8') == 'hello'
307+
assert u'hello'.encode('utf8') == 'hello'
308+
assert u'hello'.encode('utf-16-le') == 'h\000e\000l\000l\000o\000'
309+
assert u'hello'.encode('utf-16-be') == '\000h\000e\000l\000l\000o'
310+
assert u'hello'.encode('latin-1') == 'hello'
311+
312+
u = u''.join(map(unichr, range(1024)))
313+
for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
314+
'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
315+
assert unicode(u.encode(encoding),encoding) == u
316+
317+
u = u''.join(map(unichr, range(256)))
318+
for encoding in ('latin-1',):
319+
assert unicode(u.encode(encoding),encoding) == u
320+
321+
u = u''.join(map(unichr, range(128)))
322+
for encoding in ('ascii',):
323+
assert unicode(u.encode(encoding),encoding) == u
324+
325+
print 'done.'

Misc/unicode.txt

Lines changed: 110 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -715,21 +715,126 @@ Internal Argument Parsing:
715715

716716
These markers are used by the PyArg_ParseTuple() APIs:
717717

718-
'U': Check for Unicode object and return a pointer to it
718+
"U": Check for Unicode object and return a pointer to it
719719

720-
's': For Unicode objects: auto convert them to the <default encoding>
720+
"s": For Unicode objects: auto convert them to the <default encoding>
721721
and return a pointer to the object's <defencstr> buffer.
722722

723-
's#': Access to the Unicode object via the bf_getreadbuf buffer interface
723+
"s#": Access to the Unicode object via the bf_getreadbuf buffer interface
724724
(see Buffer Interface); note that the length relates to the buffer
725725
length, not the Unicode string length (this may be different
726726
depending on the Internal Format).
727727

728-
't#': Access to the Unicode object via the bf_getcharbuf buffer interface
728+
"t#": Access to the Unicode object via the bf_getcharbuf buffer interface
729729
(see Buffer Interface); note that the length relates to the buffer
730730
length, not necessarily to the Unicode string length (this may
731731
be different depending on the <default encoding>).
732732

733+
"es":
734+
Takes two parameters: encoding (const char *) and
735+
buffer (char **).
736+
737+
The input object is first coerced to Unicode in the usual way
738+
and then encoded into a string using the given encoding.
739+
740+
On output, a buffer of the needed size is allocated and
741+
returned through *buffer as a NULL-terminated string.
742+
The encoded string may not contain embedded NULL characters.
743+
The caller is responsible for free()ing the allocated *buffer
744+
after usage.
745+
746+
"es#":
747+
Takes three parameters: encoding (const char *),
748+
buffer (char **) and buffer_len (int *).
749+
750+
The input object is first coerced to Unicode in the usual way
751+
and then encoded into a string using the given encoding.
752+
753+
If *buffer is non-NULL, *buffer_len must be set to sizeof(buffer)
754+
on input. Output is then copied to *buffer.
755+
756+
If *buffer is NULL, a buffer of the needed size is
757+
allocated and output copied into it. *buffer is then
758+
updated to point to the allocated memory area. The caller
759+
is responsible for free()ing *buffer after usage.
760+
761+
In both cases *buffer_len is updated to the number of
762+
characters written (excluding the trailing NULL-byte).
763+
The output buffer is assured to be NULL-terminated.
764+
765+
Examples:
766+
767+
Using "es#" with auto-allocation:
768+
769+
static PyObject *
770+
test_parser(PyObject *self,
771+
PyObject *args)
772+
{
773+
PyObject *str;
774+
const char *encoding = "latin-1";
775+
char *buffer = NULL;
776+
int buffer_len = 0;
777+
778+
if (!PyArg_ParseTuple(args, "es#:test_parser",
779+
encoding, &buffer, &buffer_len))
780+
return NULL;
781+
if (!buffer) {
782+
PyErr_SetString(PyExc_SystemError,
783+
"buffer is NULL");
784+
return NULL;
785+
}
786+
str = PyString_FromStringAndSize(buffer, buffer_len);
787+
free(buffer);
788+
return str;
789+
}
790+
791+
Using "es" with auto-allocation returning a NULL-terminated string:
792+
793+
static PyObject *
794+
test_parser(PyObject *self,
795+
PyObject *args)
796+
{
797+
PyObject *str;
798+
const char *encoding = "latin-1";
799+
char *buffer = NULL;
800+
801+
if (!PyArg_ParseTuple(args, "es:test_parser",
802+
encoding, &buffer))
803+
return NULL;
804+
if (!buffer) {
805+
PyErr_SetString(PyExc_SystemError,
806+
"buffer is NULL");
807+
return NULL;
808+
}
809+
str = PyString_FromString(buffer);
810+
free(buffer);
811+
return str;
812+
}
813+
814+
Using "es#" with a pre-allocated buffer:
815+
816+
static PyObject *
817+
test_parser(PyObject *self,
818+
PyObject *args)
819+
{
820+
PyObject *str;
821+
const char *encoding = "latin-1";
822+
char _buffer[10];
823+
char *buffer = _buffer;
824+
int buffer_len = sizeof(_buffer);
825+
826+
if (!PyArg_ParseTuple(args, "es#:test_parser",
827+
encoding, &buffer, &buffer_len))
828+
return NULL;
829+
if (!buffer) {
830+
PyErr_SetString(PyExc_SystemError,
831+
"buffer is NULL");
832+
return NULL;
833+
}
834+
str = PyString_FromStringAndSize(buffer, buffer_len);
835+
return str;
836+
}
837+
733838

734839
File/Stream Output:
735840
-------------------
@@ -837,6 +942,7 @@ Encodings:
837942

838943
History of this Proposal:
839944
-------------------------
945+
1.3: Added new "es" and "es#" parser markers
840946
1.2: Removed POD about codecs.open()
841947
1.1: Added note about comparisons and hash values. Added note about
842948
case mapping algorithms. Changed stream codecs .read() and

Python/getargs.c

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,8 @@ vgetargs1(args, format, p_va, compat)
178178
}
179179
else if (level != 0)
180180
; /* Pass */
181+
else if (c == 'e')
182+
; /* Pass */
181183
else if (isalpha(c))
182184
max++;
183185
else if (c == '|')
@@ -654,6 +656,122 @@ convertsimple1(arg, p_format, p_va)
654656
break;
655657
}
656658

659+
case 'e': /* encoded string */
660+
{
661+
char **buffer;
662+
const char *encoding;
663+
PyObject *u, *s;
664+
int size;
665+
666+
/* Get 'e' parameter: the encoding name */
667+
encoding = (const char *)va_arg(*p_va, const char *);
668+
if (encoding == NULL)
669+
return "(encoding is NULL)";
670+
671+
/* Get 's' parameter: the output buffer to use */
672+
if (*format != 's')
673+
return "(unkown parser marker combination)";
674+
buffer = (char **)va_arg(*p_va, char **);
675+
format++;
676+
if (buffer == NULL)
677+
return "(buffer is NULL)";
678+
679+
/* Convert object to Unicode */
680+
u = PyUnicode_FromObject(arg);
681+
if (u == NULL)
682+
return "string, unicode or text buffer";
683+
684+
/* Encode object; use default error handling */
685+
s = PyUnicode_AsEncodedString(u,
686+
encoding,
687+
NULL);
688+
Py_DECREF(u);
689+
if (s == NULL)
690+
return "(encoding failed)";
691+
if (!PyString_Check(s)) {
692+
Py_DECREF(s);
693+
return "(encoder failed to return a string)";
694+
}
695+
size = PyString_GET_SIZE(s);
696+
697+
/* Write output; output is guaranteed to be
698+
0-terminated */
699+
if (*format == '#') {
700+
/* Using buffer length parameter '#':
701+
702+
- if *buffer is NULL, a new buffer
703+
of the needed size is allocated and
704+
the data copied into it; *buffer is
705+
updated to point to the new buffer;
706+
the caller is responsible for
707+
free()ing it after usage
708+
709+
- if *buffer is not NULL, the data
710+
is copied to *buffer; *buffer_len
711+
has to be set to the size of the
712+
buffer on input; buffer overflow is
713+
signalled with an error; buffer has
714+
to provide enough room for the
715+
encoded string plus the trailing
716+
0-byte
717+
718+
- in both cases, *buffer_len is
719+
updated to the size of the buffer
720+
/excluding/ the trailing 0-byte
721+
722+
*/
723+
int *buffer_len = va_arg(*p_va, int *);
724+
725+
format++;
726+
if (buffer_len == NULL)
727+
return "(buffer_len is NULL)";
728+
if (*buffer == NULL) {
729+
*buffer = PyMem_NEW(char, size + 1);
730+
if (*buffer == NULL) {
731+
Py_DECREF(s);
732+
return "(memory error)";
733+
}
734+
} else {
735+
if (size + 1 > *buffer_len) {
736+
Py_DECREF(s);
737+
return "(buffer overflow)";
738+
}
739+
}
740+
memcpy(*buffer,
741+
PyString_AS_STRING(s),
742+
size + 1);
743+
*buffer_len = size;
744+
} else {
745+
/* Using a 0-terminated buffer:
746+
747+
- the encoded string has to be
748+
0-terminated for this variant to
749+
work; if it is not, an error is raised
750+
751+
- a new buffer of the needed size
752+
is allocated and the data copied
753+
into it; *buffer is updated to
754+
point to the new buffer; the caller
755+
is responsible for free()ing it
756+
after usage
757+
758+
*/
759+
if (strlen(PyString_AS_STRING(s)) != size)
760+
return "(encoded string without "\
761+
"NULL bytes)";
762+
*buffer = PyMem_NEW(char, size + 1);
763+
if (*buffer == NULL) {
764+
Py_DECREF(s);
765+
return "(memory error)";
766+
}
767+
memcpy(*buffer,
768+
PyString_AS_STRING(s),
769+
size + 1);
770+
}
771+
Py_DECREF(s);
772+
break;
773+
}
774+
657775
case 'S': /* string object */
658776
{
659777
PyObject **p = va_arg(*p_va, PyObject **);

0 commit comments

Comments
 (0)