Optimize error handlers of ASCII and Latin1 encoders when the replacement

vstinner · vstinner · commit 6bd525b656f7 · 2015-10-09T13:10:05.000+02:00
string is pure ASCII: use _PyBytesWriter_WriteBytes() instead of checking each
character individually.

Cleanup unicode_encode_ucs1():

* Rename repunicode to rep
* Clear rep object on error
* Factor out the code shared by the bytes and unicode paths
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h
@@ -311,7 +311,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
 #if STRINGLIB_SIZEOF_CHAR > 1
         else if (Py_UNICODE_IS_SURROGATE(ch)) {
             Py_ssize_t startpos, endpos, newpos;
-            Py_ssize_t repsize, k;
+            Py_ssize_t k;
             if (error_handler == _Py_ERROR_UNKNOWN)
                 error_handler = get_error_handler(errors);
 
@@ -392,20 +392,12 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
                     p = _PyBytesWriter_WriteBytes(&writer, p,
                                                   PyBytes_AS_STRING(rep),
                                                   PyBytes_GET_SIZE(rep));
-                    if (p == NULL)
-                        goto error;
                 }
                 else {
                     /* rep is unicode */
                     if (PyUnicode_READY(rep) < 0)
                         goto error;
 
-                    repsize = PyUnicode_GET_LENGTH(rep);
-
-                    p = _PyBytesWriter_Prepare(&writer, p, repsize);
-                    if (p == NULL)
-                        goto error;
-
                     if (!PyUnicode_IS_ASCII(rep)) {
                         raise_encode_exception(&exc, "utf-8",
                                                unicode,
@@ -415,9 +407,13 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
                     }
 
                     assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
-                    memcpy(p, PyUnicode_DATA(rep), repsize);
-                    p += repsize;
+                    p = _PyBytesWriter_WriteBytes(&writer, p,
+                                                  PyUnicode_DATA(rep),
+                                                  PyUnicode_GET_LENGTH(rep));
                 }
+
+                if (p == NULL)
+                    goto error;
                 Py_CLEAR(rep);
 
                 i = newpos;
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -6599,6 +6599,7 @@ unicode_encode_ucs1(PyObject *unicode,
     PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
+    PyObject *rep = NULL;
     /* output object */
     _PyBytesWriter writer;
 
@@ -6627,8 +6628,7 @@ unicode_encode_ucs1(PyObject *unicode,
             ++pos;
         }
         else {
-            PyObject *repunicode;
-            Py_ssize_t repsize, newpos, i;
+            Py_ssize_t newpos, i;
             /* startpos for collecting unencodable chars */
             Py_ssize_t collstart = pos;
             Py_ssize_t collend = collstart + 1;
@@ -6694,52 +6694,59 @@ unicode_encode_ucs1(PyObject *unicode,
                 /* fallback to general error handling */
 
             default:
-                repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj,
-                                                              encoding, reason, unicode, &exc,
-                                                              collstart, collend, &newpos);
-                if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
-                                           PyUnicode_READY(repunicode) == -1))
+                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
+                                                       encoding, reason, unicode, &exc,
+                                                       collstart, collend, &newpos);
+                if (rep == NULL)
                     goto onError;
 
                 /* substract preallocated bytes */
                 writer.min_size -= 1;
 
-                if (PyBytes_Check(repunicode)) {
+                if (PyBytes_Check(rep)) {
                     /* Directly copy bytes result to output. */
                     str = _PyBytesWriter_WriteBytes(&writer, str,
-                                                    PyBytes_AS_STRING(repunicode),
-                                                    PyBytes_GET_SIZE(repunicode));
+                                                    PyBytes_AS_STRING(rep),
+                                                    PyBytes_GET_SIZE(rep));
                     if (str == NULL)
                         goto onError;
-
-                    pos = newpos;
-                    Py_DECREF(repunicode);
-                    break;
                 }
+                else {
+                    assert(PyUnicode_Check(rep));
 
-                /* need more space? (at least enough for what we
-                   have+the replacement+the rest of the string, so
-                   we won't have to check space for encodable characters) */
-                repsize = PyUnicode_GET_LENGTH(repunicode);
+                    if (PyUnicode_READY(rep) < 0)
+                        goto onError;
 
-                str = _PyBytesWriter_Prepare(&writer, str, repsize);
-                if (str == NULL)
-                    goto onError;
+                    if (PyUnicode_IS_ASCII(rep)) {
+                        /* Fast path: all characters are smaller than limit */
+                        assert(limit >= 128);
+                        assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
+                        str = _PyBytesWriter_WriteBytes(&writer, str,
+                                                        PyUnicode_DATA(rep),
+                                                        PyUnicode_GET_LENGTH(rep));
+                    }
+                    else {
+                        Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
 
-                /* check if there is anything unencodable in the replacement
-                   and copy it to the output */
-                for (i = 0; repsize-->0; ++i, ++str) {
-                    ch = PyUnicode_READ_CHAR(repunicode, i);
-                    if (ch >= limit) {
-                        raise_encode_exception(&exc, encoding, unicode,
-                                               pos, pos+1, reason);
-                        Py_DECREF(repunicode);
-                        goto onError;
+                        str = _PyBytesWriter_Prepare(&writer, str, repsize);
+                        if (str == NULL)
+                            goto onError;
+
+                        /* check if there is anything unencodable in the
+                           replacement and copy it to the output */
+                        for (i = 0; repsize-->0; ++i, ++str) {
+                            ch = PyUnicode_READ_CHAR(rep, i);
+                            if (ch >= limit) {
+                                raise_encode_exception(&exc, encoding, unicode,
+                                                       pos, pos+1, reason);
+                                goto onError;
+                            }
+                            *str = (char)ch;
+                        }
                     }
-                    *str = (char)ch;
                 }
                 pos = newpos;
-                Py_DECREF(repunicode);
+                Py_CLEAR(rep);
             }
 
             /* If overallocation was disabled, ensure that it was the last
@@ -6753,6 +6760,7 @@ unicode_encode_ucs1(PyObject *unicode,
     return _PyBytesWriter_Finish(&writer, str);
 
   onError:
+    Py_XDECREF(rep);
     _PyBytesWriter_Dealloc(&writer);
     Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);