Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 2593146

Browse files
committed
Bug #2301: Don't try decoding the source code into the original
encoding for syntax errors.
1 parent ddaa706 commit 2593146

4 files changed

Lines changed: 18 additions & 74 deletions

File tree

Lib/test/test_pep263.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,13 @@ def test_compilestring(self):
2323
exec(c, d)
2424
self.assertEqual(d['u'], '\xf3')
2525

26+
def test_issue2301(self):
27+
try:
28+
compile(b"# coding: cp932\nprint '\x94\x4e'", "dummy", "exec")
29+
except SyntaxError as v:
30+
self.assertEquals(v.text, "print '\u5e74'")
31+
else:
32+
self.fail()
2633

2734
def test_main():
2835
test_support.run_unittest(PEP263Test)

Misc/NEWS

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,12 @@ What's New in Python 3.0a4?
99

1010
*Release date: XX-XXX-2008*
1111

12+
Core and Builtins
13+
-----------------
14+
15+
- Bug #2301: Don't try decoding the source code into the original
16+
encoding for syntax errors.
17+
1218
Extension Modules
1319
-----------------
1420

Parser/parsetok.c

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -213,21 +213,16 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
213213
err_ret->error = E_EOF;
214214
err_ret->lineno = tok->lineno;
215215
if (tok->buf != NULL) {
216-
char *text = NULL;
217216
size_t len;
218217
assert(tok->cur - tok->buf < INT_MAX);
219218
err_ret->offset = (int)(tok->cur - tok->buf);
220219
len = tok->inp - tok->buf;
221-
text = PyTokenizer_RestoreEncoding(tok, len, &err_ret->offset);
222-
if (text == NULL) {
223-
text = (char *) PyObject_MALLOC(len + 1);
224-
if (text != NULL) {
225-
if (len > 0)
226-
strncpy(text, tok->buf, len);
227-
text[len] = '\0';
228-
}
220+
err_ret->text = (char *) PyObject_MALLOC(len + 1);
221+
if (err_ret->text != NULL) {
222+
if (len > 0)
223+
strncpy(err_ret->text, tok->buf, len);
224+
err_ret->text[len] = '\0';
229225
}
230-
err_ret->text = text;
231226
}
232227
} else if (tok->encoding != NULL) {
233228
node* r = PyNode_New(encoding_decl);

Parser/tokenizer.c

Lines changed: 0 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1579,70 +1579,6 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
15791579
return result;
15801580
}
15811581

1582-
/* This function is only called from parsetok. However, it cannot live
1583-
there, as it must be empty for PGEN, and we can check for PGEN only
1584-
in this file. */
1585-
1586-
#ifdef PGEN
1587-
char*
1588-
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
1589-
{
1590-
return NULL;
1591-
}
1592-
#else
1593-
static PyObject *
1594-
dec_utf8(const char *enc, const char *text, size_t len) {
1595-
PyObject *ret = NULL;
1596-
PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1597-
if (unicode_text) {
1598-
ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1599-
Py_DECREF(unicode_text);
1600-
}
1601-
if (!ret) {
1602-
PyErr_Clear();
1603-
}
1604-
else {
1605-
assert(PyString_Check(ret));
1606-
}
1607-
return ret;
1608-
}
1609-
1610-
char *
1611-
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1612-
{
1613-
char *text = NULL;
1614-
if (tok->encoding) {
1615-
/* convert source to original encondig */
1616-
PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1617-
if (lineobj != NULL) {
1618-
int linelen = PyString_GET_SIZE(lineobj);
1619-
const char *line = PyString_AS_STRING(lineobj);
1620-
text = PyObject_MALLOC(linelen + 1);
1621-
if (text != NULL && line != NULL) {
1622-
if (linelen)
1623-
strncpy(text, line, linelen);
1624-
text[linelen] = '\0';
1625-
}
1626-
Py_DECREF(lineobj);
1627-
1628-
/* adjust error offset */
1629-
if (*offset > 1) {
1630-
PyObject *offsetobj = dec_utf8(tok->encoding,
1631-
tok->buf,
1632-
*offset-1);
1633-
if (offsetobj) {
1634-
*offset = 1 + Py_SIZE(offsetobj);
1635-
Py_DECREF(offsetobj);
1636-
}
1637-
}
1638-
1639-
}
1640-
}
1641-
return text;
1642-
1643-
}
1644-
#endif
1645-
16461582
/* Get -*- encoding -*- from a Python file.
16471583
16481584
PyTokenizer_FindEncoding returns NULL when it can't find the encoding in

0 commit comments

Comments
 (0)