Create _Py_wchar2char() function, reverse of _Py_char2wchar()

Victor Stinner · Victor Stinner · commit f2e08b34f1fa · 2010-08-13T23:29:08.000Z
* Use _Py_wchar2char() in _wstat() and _Py_wfopen()
 * Document _Py_char2wchar()
diff --git a/Include/Python.h b/Include/Python.h
@@ -126,12 +126,15 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
+
 /* _Py_Mangle is defined in compile.c */
 PyAPI_FUNC(PyObject*) _Py_Mangle(PyObject *p, PyObject *name);
 
 /* These functions live in main.c */
 PyAPI_FUNC(wchar_t *) _Py_char2wchar(char *);
+PyAPI_FUNC(char*) _Py_wchar2char(const wchar_t *text);
 PyAPI_FUNC(FILE *) _Py_wfopen(const wchar_t *path, const wchar_t *mode);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/Modules/getpath.c b/Modules/getpath.c
@@ -139,13 +139,16 @@ static wchar_t *lib_python = L"lib/python" VERSION;
 static int
 _wstat(const wchar_t* path, struct stat *buf)
 {
-    char fname[PATH_MAX];
-    size_t res = wcstombs(fname, path, sizeof(fname));
-    if (res == (size_t)-1) {
+    int err;
+    char *fname;
+    fname = _Py_wchar2char(path);
+    if (fname == NULL) {
         errno = EINVAL;
         return -1;
     }
-    return stat(fname, buf);
+    err = stat(fname, buf);
+    PyMem_Free(fname);
+    return err;
 }
 #endif
 
diff --git a/Modules/main.c b/Modules/main.c
@@ -105,20 +105,21 @@ FILE *
 _Py_wfopen(const wchar_t *path, const wchar_t *mode)
 {
 #ifndef MS_WINDOWS
-    char cpath[PATH_MAX];
+    FILE *f;
+    char *cpath;
     char cmode[10];
     size_t r;
-    r = wcstombs(cpath, path, PATH_MAX);
-    if (r == (size_t)-1 || r >= PATH_MAX) {
-        errno = EINVAL;
-        return NULL;
-    }
     r = wcstombs(cmode, mode, 10);
     if (r == (size_t)-1 || r >= 10) {
         errno = EINVAL;
         return NULL;
     }
-    return fopen(cpath, cmode);
+    cpath = _Py_wchar2char(path);
+    if (cpath == NULL)
+        return NULL;
+    f = fopen(cpath, cmode);
+    PyMem_Free(cpath);
+    return f;
 #else
     return _wfopen(path, mode);
 #endif
@@ -734,6 +735,85 @@ Py_GetArgcArgv(int *argc, wchar_t ***argv)
 }
 
 
+/* Encode a (wide) character string to the locale encoding with the
+   surrogateescape error handler (characters in range U+DC80..U+DCFF are
+   converted to bytes 0x80..0xFF).
+
+   This function is the reverse of _Py_char2wchar().
+
+   Return a pointer to a newly allocated byte string (use PyMem_Free() to free
+   the memory), or NULL on error (conversion error or memory error). */
+char*
+_Py_wchar2char(const wchar_t *text)
+{
+    const size_t len = wcslen(text);
+    char *result = NULL, *bytes = NULL;
+    size_t i, size, converted;
+    wchar_t c, buf[2];
+
+    /* The function works in two steps:
+       1. compute the length of the output buffer in bytes (size)
+       2. outputs the bytes */
+    size = 0;
+    buf[1] = 0;
+    while (1) {
+        for (i=0; i < len; i++) {
+            c = text[i];
+            if (c >= 0xdc80 && c <= 0xdcff) {
+                /* UTF-8b surrogate */
+                if (bytes != NULL) {
+                    *bytes++ = c - 0xdc00;
+                    size--;
+                }
+                else
+                    size++;
+                continue;
+            }
+            else {
+                buf[0] = c;
+                if (bytes != NULL)
+                    converted = wcstombs(bytes, buf, size);
+                else
+                    converted = wcstombs(NULL, buf, 0);
+                if (converted == (size_t)-1) {
+                    if (result != NULL)
+                        PyMem_Free(result);
+                    return NULL;
+                }
+                if (bytes != NULL) {
+                    bytes += converted;
+                    size -= converted;
+                }
+                else
+                    size += converted;
+            }
+        }
+        if (result != NULL) {
+            *bytes = 0;
+            break;
+        }
+
+        size += 1; /* nul byte at the end */
+        result = PyMem_Malloc(size);
+        if (result == NULL)
+            return NULL;
+        bytes = result;
+    }
+    return result;
+}
+
+
+/* Decode a byte string from the locale encoding with the
+   surrogateescape error handler (undecodable bytes are decoded as characters
+   in range U+DC80..U+DCFF). If a byte sequence can be decoded as a surrogate
+   character, escape the bytes using the surrogateescape error handler instead
+   of decoding them.
+
+   Use _Py_wchar2char() to encode the character string back to a byte string.
+
+   Return a pointer to a newly allocated (wide) character string (use
+   PyMem_Free() to free the memory), or NULL on error (conversion error or
+   memory error). */
 wchar_t*
 _Py_char2wchar(char* arg)
 {