reorganized PyUnicode_DecodeUnicodeEscape a bit (in order to make it

Fredrik Lundh · Fredrik Lundh · commit ccc7473fc859 · 2001-02-18T22:13:49.000Z
less likely that bug #132817 ever appears again)
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -1110,10 +1110,11 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
 					const char *errors)
 {
     PyUnicodeObject *v;
-    Py_UNICODE *p = NULL, *buf = NULL;
+    Py_UNICODE *p, *buf;
     const char *end;
-    Py_UCS4 chr;
-    
+    char* message;
+    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
+
     /* Escaped strings will always be longer than the resulting
        Unicode string, so we start with size here and then reduce the
        length after conversion to the true value. */
@@ -1122,16 +1123,18 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
         goto onError;
     if (size == 0)
         return (PyObject *)v;
+
     p = buf = PyUnicode_AS_UNICODE(v);
     end = s + size;
+
     while (s < end) {
         unsigned char c;
         Py_UNICODE x;
-        int i;
+        int i, digits;
 
         /* Non-escape characters are interpreted as Unicode ordinals */
         if (*s != '\\') {
-            *p++ = (unsigned char)*s++;
+            *p++ = (unsigned char) *s++;
             continue;
         }
 
@@ -1164,60 +1167,31 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
             *p++ = x;
             break;
 
-        /* \xXX with two hex digits */
+        /* hex escapes */
+        /* \xXX */
         case 'x':
-            for (x = 0, i = 0; i < 2; i++) {
-                c = (unsigned char)s[i];
-                if (!isxdigit(c)) {
-                    if (unicodeescape_decoding_error(&s, &x, errors,
-                                                     "truncated \\xXX"))
-                        goto onError;
-                    i++;
-                    break;
-                }
-                x = (x<<4) & ~0xF;
-                if (c >= '0' && c <= '9')
-                    x += c - '0';
-                else if (c >= 'a' && c <= 'f')
-                    x += 10 + c - 'a';
-                else
-                    x += 10 + c - 'A';
-            }
-            s += i;
-            *p++ = x;
-            break;
+            digits = 2;
+            message = "truncated \\xXX escape";
+            goto hexescape;
 
-        /* \uXXXX with 4 hex digits */
+        /* \uXXXX */
         case 'u':
-            for (x = 0, i = 0; i < 4; i++) {
-                c = (unsigned char)s[i];
-                if (!isxdigit(c)) {
-                    if (unicodeescape_decoding_error(&s, &x, errors,
-                                                     "truncated \\uXXXX"))
-                        goto onError;
-                    i++;
-                    break;
-                }
-                x = (x<<4) & ~0xF;
-                if (c >= '0' && c <= '9')
-                    x += c - '0';
-                else if (c >= 'a' && c <= 'f')
-                    x += 10 + c - 'a';
-                else
-                    x += 10 + c - 'A';
-            }
-            s += i;
-            *p++ = x;
-            break;
+            digits = 4;
+            message = "truncated \\uXXXX escape";
+            goto hexescape;
 
-        /* \UXXXXXXXX with 8 hex digits */
+        /* \UXXXXXXXX */
         case 'U':
-            for (chr = 0, i = 0; i < 8; i++) {
-                c = (unsigned char)s[i];
+            digits = 8;
+            message = "truncated \\UXXXXXXXX escape";
+        hexescape:
+            chr = 0;
+            for (i = 0; i < digits; i++) {
+                c = (unsigned char) s[i];
                 if (!isxdigit(c)) {
-                    if (unicodeescape_decoding_error(&s, &x, errors,
-                                                     "truncated \\uXXXX"))
+                    if (unicodeescape_decoding_error(&s, &x, errors, message))
                         goto onError;
+                    chr = x;
                     i++;
                     break;
                 }
@@ -1230,95 +1204,80 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
                     chr += 10 + c - 'A';
             }
             s += i;
-            goto store;
+        store:
+            /* when we get here, chr is a 32-bit unicode character */
+            if (chr <= 0xffff)
+                /* UCS-2 character */
+                *p++ = (Py_UNICODE) chr;
+            else if (chr <= 0x10ffff) {
+                /* UCS-4 character.  store as two surrogate characters */
+                chr -= 0x10000L;
+                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
+                *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
+            } else {
+                if (unicodeescape_decoding_error(
+                    &s, &x, errors,
+                    "illegal Unicode character")
+                    )
+                    goto onError;
+                *p++ = x; /* store replacement character */
+            }
+            break;
 
+        /* \N{name} */
         case 'N':
-            /* Ok, we need to deal with Unicode Character Names now,
-             * make sure we've imported the hash table data...
-             */
+            message = "malformed \\N character escape";
             if (ucnhash_CAPI == NULL) {
-                PyObject *mod = 0, *v = 0;
-                mod = PyImport_ImportModule("unicodedata");
-                if (mod == NULL)
+                /* load the unicode data module */
+                PyObject *m, *v;
+                m = PyImport_ImportModule("unicodedata");
+                if (m == NULL)
                     goto ucnhashError;
-                v = PyObject_GetAttrString(mod,"ucnhash_CAPI");
-                Py_DECREF(mod);
+                v = PyObject_GetAttrString(m, "ucnhash_CAPI");
+                Py_DECREF(m);
                 if (v == NULL)
                     goto ucnhashError;
                 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
                 Py_DECREF(v);
                 if (ucnhash_CAPI == NULL)
                     goto ucnhashError;
             }
-                
             if (*s == '{') {
-                const char *start = s + 1;
-                const char *endBrace = start;
-
+                const char *start = s+1;
                 /* look for the closing brace */
-                while (*endBrace != '}' && endBrace < end)
-                    endBrace++;
-                if (endBrace != end && *endBrace == '}') {
-                    if (!ucnhash_CAPI->getcode(start, endBrace-start, &chr)) {
-                        if (unicodeescape_decoding_error(
-                                &s, &x, errors,
-                                "Invalid Unicode Character Name")
-                            )
-                            goto onError;
-                        goto ucnFallthrough;
-                    }
-                    s = endBrace + 1;
-                    goto store;
-                } else {
-                    if (unicodeescape_decoding_error(
-                            &s, &x, errors,
-                            "Unicode name missing closing brace"))
-                        goto onError;
-                    goto ucnFallthrough;
+                while (*s != '}' && s < end)
+                    s++;
+                if (s > start && s < end && *s == '}') {
+                    /* found a name.  look it up in the unicode database */
+                    message = "unknown Unicode character name";
+                    s++;
+                    if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
+                        goto store;
                 }
-                break;                
             }
-            if (unicodeescape_decoding_error(
-                    &s, &x, errors,
-                    "Missing opening brace for Unicode Character Name escape"))
+            if (unicodeescape_decoding_error(&s, &x, errors, message))
                 goto onError;
-ucnFallthrough:
-            /* fall through on purpose */
-		default:
+            *p++ = x;
+            break;
+
+        default:
             *p++ = '\\';
             *p++ = (unsigned char)s[-1];
             break;
-store:
-            /* when we get here, chr is a 32-bit unicode character */
-            if (chr <= 0xffff)
-                /* UCS-2 character */
-                *p++ = (Py_UNICODE) chr;
-            else if (chr <= 0x10ffff) {
-                /* UCS-4 character.  store as two surrogate characters */
-                chr -= 0x10000L;
-                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
-                *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
-            } else {
-                if (unicodeescape_decoding_error(
-                    &s, &x, errors,
-                    "Illegal Unicode character")
-                    )
-                    goto onError;
-            }
         }
     }
     if (_PyUnicode_Resize(v, (int)(p - buf)))
 		goto onError;
     return (PyObject *)v;
     
- ucnhashError:
+ucnhashError:
     PyErr_SetString(
         PyExc_UnicodeError,
         "\\N escapes not supported (can't load unicodedata module)"
         );
     return NULL;
 
- onError:
+onError:
     Py_XDECREF(v);
     return NULL;
 }