Issue #16330: Use surrogate-related macros

vstinner · vstinner · commit 76df43de30f4 · 2012-10-30T01:42:39.000+01:00
Patch written by Serhiy Storchaka.
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
@@ -180,9 +180,9 @@ typedef unsigned char Py_UCS1;
     } while (0)
 
 /* macros to work with surrogates */
-#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDFFF)
-#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF)
-#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF)
+#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
+#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
+#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
 /* Join two surrogate characters and return a single Py_UCS4 value. */
 #define Py_UNICODE_JOIN_SURROGATES(high, low)  \
     (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
diff --git a/Modules/_json.c b/Modules/_json.c
@@ -174,14 +174,13 @@ ascii_escape_unichar(Py_UCS4 c, unsigned char *output, Py_ssize_t chars)
         default:
             if (c >= 0x10000) {
                 /* UTF-16 surrogate pair */
-                Py_UCS4 v = c - 0x10000;
-                c = 0xd800 | ((v >> 10) & 0x3ff);
+                Py_UCS4 v = Py_UNICODE_HIGH_SURROGATE(c);
                 output[chars++] = 'u';
-                output[chars++] = Py_hexdigits[(c >> 12) & 0xf];
-                output[chars++] = Py_hexdigits[(c >>  8) & 0xf];
-                output[chars++] = Py_hexdigits[(c >>  4) & 0xf];
-                output[chars++] = Py_hexdigits[(c      ) & 0xf];
-                c = 0xdc00 | (v & 0x3ff);
+                output[chars++] = Py_hexdigits[(v >> 12) & 0xf];
+                output[chars++] = Py_hexdigits[(v >>  8) & 0xf];
+                output[chars++] = Py_hexdigits[(v >>  4) & 0xf];
+                output[chars++] = Py_hexdigits[(v      ) & 0xf];
+                c = Py_UNICODE_LOW_SURROGATE(c);
                 output[chars++] = '\\';
             }
             output[chars++] = 'u';
@@ -431,7 +430,7 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
                 }
             }
             /* Surrogate pair */
-            if ((c & 0xfc00) == 0xd800) {
+            if (Py_UNICODE_IS_HIGH_SURROGATE(c)) {
                 Py_UCS4 c2 = 0;
                 if (end + 6 >= len) {
                     raise_errmsg("Unpaired high surrogate", pystr, end - 5);
@@ -462,13 +461,13 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
                             goto bail;
                     }
                 }
-                if ((c2 & 0xfc00) != 0xdc00) {
+                if (!Py_UNICODE_IS_LOW_SURROGATE(c2)) {
                     raise_errmsg("Unpaired high surrogate", pystr, end - 5);
                     goto bail;
                 }
-                c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
+                c = Py_UNICODE_JOIN_SURROGATES(c, c2);
             }
-            else if ((c & 0xfc00) == 0xdc00) {
+            else if (Py_UNICODE_IS_LOW_SURROGATE(c)) {
                 raise_errmsg("Unpaired low surrogate", pystr, end - 5);
                 goto bail;
             }
diff --git a/Modules/cjkcodecs/cjkcodecs.h b/Modules/cjkcodecs/cjkcodecs.h
@@ -148,8 +148,8 @@ static const struct dbcs_map *mapping_list;
 #if Py_UNICODE_SIZE == 2
 # define WRITEUCS4(c)                                           \
     REQUIRE_OUTBUF(2)                                           \
-    (*outbuf)[0] = 0xd800 + (((c) - 0x10000) >> 10);            \
-    (*outbuf)[1] = 0xdc00 + (((c) - 0x10000) & 0x3ff);          \
+    (*outbuf)[0] = Py_UNICODE_HIGH_SURROGATE(c);                \
+    (*outbuf)[1] = Py_UNICODE_LOW_SURROGATE(c);                 \
     NEXT_OUT(2)
 #else
 # define WRITEUCS4(c)                                           \
@@ -188,11 +188,10 @@ static const struct dbcs_map *mapping_list;
 
 #if Py_UNICODE_SIZE == 2
 #define DECODE_SURROGATE(c)                                     \
-    if (c >> 10 == 0xd800 >> 10) { /* high surrogate */         \
+    if (Py_UNICODE_IS_HIGH_SURROGATE(c)) {                      \
         REQUIRE_INBUF(2)                                        \
-        if (IN2 >> 10 == 0xdc00 >> 10) { /* low surrogate */ \
-            c = 0x10000 + ((ucs4_t)(c - 0xd800) << 10) + \
-            ((ucs4_t)(IN2) - 0xdc00);                           \
+        if (Py_UNICODE_IS_LOW_SURROGATE(IN2)) {                 \
+            c = Py_UNICODE_JOIN_SURROGATES(c, IN2)              \
         }                                                       \
     }
 #define GET_INSIZE(c)   ((c) > 0xffff ? 2 : 1)
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -4412,7 +4412,7 @@ _PyUnicode_EncodeUTF7(PyObject *str,
 
             /* code first surrogate */
             base64bits += 16;
-            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
+            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
             while (base64bits >= 6) {
                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
                 base64bits -= 6;
@@ -7052,9 +7052,8 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
             charsize = 1;
         }
         else {
-            ch -= 0x10000;
-            chars[0] = 0xd800 + (ch >> 10);
-            chars[1] = 0xdc00 + (ch & 0x3ff);
+            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
+            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
             charsize = 2;
         }
 
diff --git a/Python/codecs.c b/Python/codecs.c
@@ -761,7 +761,7 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
         for (i = start; i < end; i++) {
             /* object is guaranteed to be "ready" */
             Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
-            if (ch < 0xd800 || ch > 0xdfff) {
+            if (!Py_UNICODE_IS_SURROGATE(ch)) {
                 /* Not a surrogate, fail with original exception */
                 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
                 Py_DECREF(res);
@@ -797,7 +797,7 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
              (p[2] & 0xc0) == 0x80)) {
             /* it's a three-byte code */
             ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
-            if (ch < 0xd800 || ch > 0xdfff)
+            if (!Py_UNICODE_IS_SURROGATE(ch))
                 /* it's not a surrogate - fail */
                 ch = 0;
         }
diff --git a/Python/fileutils.c b/Python/fileutils.c
@@ -85,7 +85,7 @@ _Py_char2wchar(const char* arg, size_t *size)
             /* Only use the result if it contains no
                surrogate characters. */
             for (tmp = res; *tmp != 0 &&
-                         (*tmp < 0xd800 || *tmp > 0xdfff); tmp++)
+                         !Py_UNICODE_IS_SURROGATE(*tmp); tmp++)
                 ;
             if (*tmp == 0) {
                 if (size != NULL)
@@ -131,7 +131,7 @@ _Py_char2wchar(const char* arg, size_t *size)
             memset(&mbs, 0, sizeof mbs);
             continue;
         }
-        if (*out >= 0xd800 && *out <= 0xdfff) {
+        if (Py_UNICODE_IS_SURROGATE(*out)) {
             /* Surrogate character.  Escape the original
                byte sequence with surrogateescape. */
             argsize -= converted;