Marc-Andre Lemburg:

gvanrossum · gvanrossum · commit d8855fde885f · 2000-03-24T22:14:19.000Z
Attached you find the latest update of the Unicode implementation.
The patch is against the current CVS version.

It includes the fix I posted yesterday for the core dump problem
in codecs.c (was introduced by my previous patch set -- sorry),
adds more tests for the codecs and two new parser markers
"es" and "es#".
diff --git a/Lib/codecs.py b/Lib/codecs.py
@@ -46,7 +46,7 @@ class Codec:
         handling schemes by providing the errors argument. These
         string values are defined:
 
-         'strict' - raise an error (or a subclass)
+         'strict' - raise a ValueError (or a subclass)
          'ignore' - ignore the character and continue with the next
          'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
diff --git a/Lib/test/output/test_unicode b/Lib/test/output/test_unicode
@@ -1,5 +1,4 @@
 test_unicode
 Testing Unicode comparisons... done.
-Testing Unicode contains method... done.
 Testing Unicode formatting strings... done.
 Testing unicodedata module... done.
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
@@ -293,3 +293,33 @@ def __init__(self): self.seq = [7, u'hello', 123L]
     assert unicodedata.combining(u'\u20e1') == 230
     
     print 'done.'
+
+# Test builtin codecs
+print 'Testing builtin codecs...',
+
+assert unicode('hello','ascii') == u'hello'
+assert unicode('hello','utf-8') == u'hello'
+assert unicode('hello','utf8') == u'hello'
+assert unicode('hello','latin-1') == u'hello'
+
+assert u'hello'.encode('ascii') == 'hello'
+assert u'hello'.encode('utf-8') == 'hello'
+assert u'hello'.encode('utf8') == 'hello'
+assert u'hello'.encode('utf-16-le') == 'h\000e\000l\000l\000o\000'
+assert u'hello'.encode('utf-16-be') == '\000h\000e\000l\000l\000o'
+assert u'hello'.encode('latin-1') == 'hello'
+
+u = u''.join(map(unichr, range(1024)))
+for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
+                 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
+    assert unicode(u.encode(encoding),encoding) == u
+
+u = u''.join(map(unichr, range(256)))
+for encoding in ('latin-1',):
+    assert unicode(u.encode(encoding),encoding) == u
+
+u = u''.join(map(unichr, range(128)))
+for encoding in ('ascii',):
+    assert unicode(u.encode(encoding),encoding) == u
+
+print 'done.'
diff --git a/Misc/unicode.txt b/Misc/unicode.txt
@@ -715,21 +715,126 @@ Internal Argument Parsing:
 
 These markers are used by the PyArg_ParseTuple() APIs:
 
-  'U':  Check for Unicode object and return a pointer to it
+  "U":  Check for Unicode object and return a pointer to it
 
-  's':  For Unicode objects: auto convert them to the <default encoding>
+  "s":  For Unicode objects: auto convert them to the <default encoding>
         and return a pointer to the object's <defencstr> buffer.
 
-  's#': Access to the Unicode object via the bf_getreadbuf buffer interface 
+  "s#": Access to the Unicode object via the bf_getreadbuf buffer interface 
         (see Buffer Interface); note that the length relates to the buffer
         length, not the Unicode string length (this may be different
         depending on the Internal Format).
 
-  't#': Access to the Unicode object via the bf_getcharbuf buffer interface
+  "t#": Access to the Unicode object via the bf_getcharbuf buffer interface
         (see Buffer Interface); note that the length relates to the buffer
         length, not necessarily to the Unicode string length (this may
         be different depending on the <default encoding>).
 
+  "es": 
+	Takes two parameters: encoding (const char *) and
+	buffer (char **). 
+
+	The input object is first coerced to Unicode in the usual way
+	and then encoded into a string using the given encoding.
+
+	On output, a buffer of the needed size is allocated and
+	returned through *buffer as a NULL-terminated string.
+	The encoded string may not contain embedded NULL characters.
+	The caller is responsible for free()ing the allocated *buffer
+	after usage.
+
+  "es#":
+	Takes three parameters: encoding (const char *),
+	buffer (char **) and buffer_len (int *).
+	
+	The input object is first coerced to Unicode in the usual way
+	and then encoded into a string using the given encoding.
+
+	If *buffer is non-NULL, *buffer_len must be set to the size of
+	that buffer on input. Output is then copied to *buffer.
+
+	If *buffer is NULL, a buffer of the needed size is
+	allocated and output copied into it. *buffer is then
+	updated to point to the allocated memory area. The caller
+	is responsible for free()ing *buffer after usage.
+
+	In both cases *buffer_len is updated to the number of
+	characters written (excluding the trailing NULL-byte).
+	The output buffer is assured to be NULL-terminated.
+
+Examples:
+
+Using "es#" with auto-allocation:
+
+    static PyObject *
+    test_parser(PyObject *self,
+		PyObject *args)
+    {
+	PyObject *str;
+	const char *encoding = "latin-1";
+	char *buffer = NULL;
+	int buffer_len = 0;
+
+	if (!PyArg_ParseTuple(args, "es#:test_parser",
+			      encoding, &buffer, &buffer_len))
+	    return NULL;
+	if (!buffer) {
+	    PyErr_SetString(PyExc_SystemError,
+			    "buffer is NULL");
+	    return NULL;
+	}
+	str = PyString_FromStringAndSize(buffer, buffer_len);
+	free(buffer);
+	return str;
+    }
+
+Using "es" with auto-allocation returning a NULL-terminated string:    
+    
+    static PyObject *
+    test_parser(PyObject *self,
+		PyObject *args)
+    {
+	PyObject *str;
+	const char *encoding = "latin-1";
+	char *buffer = NULL;
+
+	if (!PyArg_ParseTuple(args, "es:test_parser",
+			      encoding, &buffer))
+	    return NULL;
+	if (!buffer) {
+	    PyErr_SetString(PyExc_SystemError,
+			    "buffer is NULL");
+	    return NULL;
+	}
+	str = PyString_FromString(buffer);
+	free(buffer);
+	return str;
+    }
+
+Using "es#" with a pre-allocated buffer:
+    
+    static PyObject *
+    test_parser(PyObject *self,
+		PyObject *args)
+    {
+	PyObject *str;
+	const char *encoding = "latin-1";
+	char _buffer[10];
+	char *buffer = _buffer;
+	int buffer_len = sizeof(_buffer);
+
+	if (!PyArg_ParseTuple(args, "es#:test_parser",
+			      encoding, &buffer, &buffer_len))
+	    return NULL;
+	if (!buffer) {
+	    PyErr_SetString(PyExc_SystemError,
+			    "buffer is NULL");
+	    return NULL;
+	}
+	str = PyString_FromStringAndSize(buffer, buffer_len);
+	return str;
+    }
+
 
 File/Stream Output:
 -------------------
@@ -837,6 +942,7 @@ Encodings:
 
 History of this Proposal:
 -------------------------
+1.3: Added new "es" and "es#" parser markers
 1.2: Removed POD about codecs.open()
 1.1: Added note about comparisons and hash values. Added note about
      case mapping algorithms. Changed stream codecs .read() and
diff --git a/Python/getargs.c b/Python/getargs.c
@@ -178,6 +178,8 @@ vgetargs1(args, format, p_va, compat)
 		}
 		else if (level != 0)
 			; /* Pass */
+		else if (c == 'e')
+			; /* Pass */
 		else if (isalpha(c))
 			max++;
 		else if (c == '|')
@@ -654,6 +656,122 @@ convertsimple1(arg, p_format, p_va)
 			break;
 		}
 	
+	case 'e': /* encoded string */
+		{
+			char **buffer;
+			const char *encoding;
+			PyObject *u, *s;
+			int size;
+
+			/* Get 'e' parameter: the encoding name */
+			encoding = (const char *)va_arg(*p_va, const char *);
+			if (encoding == NULL)
+				return "(encoding is NULL)";
+			
+			/* Get 's' parameter: the output buffer to use */
+			if (*format != 's')
+				return "(unkown parser marker combination)";
+			buffer = (char **)va_arg(*p_va, char **);
+			format++;
+			if (buffer == NULL)
+				return "(buffer is NULL)";
+			
+			/* Convert object to Unicode */
+			u = PyUnicode_FromObject(arg);
+			if (u == NULL)
+				return "string, unicode or text buffer";
+			
+			/* Encode object; use default error handling */
+			s = PyUnicode_AsEncodedString(u,
+						      encoding,
+						      NULL);
+			Py_DECREF(u);
+			if (s == NULL)
+				return "(encoding failed)";
+			if (!PyString_Check(s)) {
+				Py_DECREF(s);
+				return "(encoder failed to return a string)";
+			}
+			size = PyString_GET_SIZE(s);
+
+			/* Write output; output is guaranteed to be
+			   0-terminated */
+			if (*format == '#') { 
+				/* Using buffer length parameter '#':
+
+				   - if *buffer is NULL, a new buffer
+				   of the needed size is allocated and
+				   the data copied into it; *buffer is
+				   updated to point to the new buffer;
+				   the caller is responsible for
+				   free()ing it after usage
+
+				   - if *buffer is not NULL, the data
+				   is copied to *buffer; *buffer_len
+				   has to be set to the size of the
+				   buffer on input; buffer overflow is
+				   signalled with an error; buffer has
+				   to provide enough room for the
+				   encoded string plus the trailing
+				   0-byte
+
+				   - in both cases, *buffer_len is
+				   updated to the size of the buffer
+				   /excluding/ the trailing 0-byte
+
+				*/
+				int *buffer_len = va_arg(*p_va, int *);
+
+				format++;
+				if (buffer_len == NULL)
+					return "(buffer_len is NULL)";
+				if (*buffer == NULL) {
+					*buffer = PyMem_NEW(char, size + 1);
+					if (*buffer == NULL) {
+						Py_DECREF(s);
+						return "(memory error)";
+					}
+				} else {
+					if (size + 1 > *buffer_len) {
+						Py_DECREF(s);
+						return "(buffer overflow)";
+					}
+				}
+				memcpy(*buffer,
+				       PyString_AS_STRING(s),
+				       size + 1);
+				*buffer_len = size;
+			} else {
+				/* Using a 0-terminated buffer:
+
+				   - the encoded string must not contain
+				   embedded 0-bytes for this variant to
+				   work; if it does, an error is raised
+
+				   - a new buffer of the needed size
+				   is allocated and the data copied
+				   into it; *buffer is updated to
+				   point to the new buffer; the caller
+				   is responsible for free()ing it
+				   after usage
+
+				 */
+				if (strlen(PyString_AS_STRING(s)) != size)
+					return "(encoded string without "\
+					       "NULL bytes)";
+				*buffer = PyMem_NEW(char, size + 1);
+				if (*buffer == NULL) {
+					Py_DECREF(s);
+					return "(memory error)";
+				}
+				memcpy(*buffer,
+				       PyString_AS_STRING(s),
+				       size + 1);
+			}
+			Py_DECREF(s);
+			break;
+		}
+
 	case 'S': /* string object */
 		{
 			PyObject **p = va_arg(*p_va, PyObject **);