From 7339989c371b6b64a2c2c86a366a0af6a2adab48 Mon Sep 17 00:00:00 2001 From: John Sloboda Date: Tue, 2 Jan 2024 22:28:21 -0500 Subject: [PATCH 01/22] fix issue gh-85287 --- Lib/encodings/idna.py | 109 ++++++++++++++++++++++------- Lib/encodings/punycode.py | 13 ++-- Lib/encodings/undefined.py | 10 +-- Lib/encodings/utf_16.py | 4 +- Lib/encodings/utf_32.py | 6 +- Lib/test/test_codecs.py | 30 ++++---- Lib/test/test_multibytecodec.py | 4 +- Modules/cjkcodecs/multibytecodec.c | 60 +++++++++++++--- 8 files changed, 166 insertions(+), 70 deletions(-) diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py index 5396047a7fb0b8..8a33c82c797b27 100644 --- a/Lib/encodings/idna.py +++ b/Lib/encodings/idna.py @@ -25,7 +25,7 @@ def nameprep(label): label = unicodedata.normalize("NFKC", label) # Prohibit - for c in label: + for i, c in enumerate(label): if stringprep.in_table_c12(c) or \ stringprep.in_table_c22(c) or \ stringprep.in_table_c3(c) or \ @@ -35,7 +35,7 @@ def nameprep(label): stringprep.in_table_c7(c) or \ stringprep.in_table_c8(c) or \ stringprep.in_table_c9(c): - raise UnicodeError("Invalid character %r" % c) + raise UnicodeDecodeError("idna", label.encode("ascii"), i, i+1, f"Invalid character {c!r}") # Check bidi RandAL = [stringprep.in_table_d1(x) for x in label] @@ -46,14 +46,18 @@ def nameprep(label): # This is table C.8, which was already checked # 2) If a string contains any RandALCat character, the string # MUST NOT contain any LCat character. - if any(stringprep.in_table_d2(x) for x in label): - raise UnicodeError("Violation of BIDI requirement 2") + for i, x in enumerate(label): + if stringprep.in_table_d2(x): + raise UnicodeDecodeError("idna", label.encode("ascii"), i, i+1, "Violation of BIDI requirement 2") # 3) If a string contains any RandALCat character, a # RandALCat character MUST be the first character of the # string, and a RandALCat character MUST be the last # character of the string. 
- if not RandAL[0] or not RandAL[-1]: - raise UnicodeError("Violation of BIDI requirement 3") + if not RandAL[0]: + raise UnicodeDecodeError("idna", label.encode("ascii"), 0, 1, "Violation of BIDI requirement 3") + if not RandAL[-1]: + s_label = label.encode("ascii") + raise UnicodeDecodeError("idna", s_label, len(s_label)-1, len(s_label), "Violation of BIDI requirement 3") return label @@ -61,14 +65,15 @@ def ToASCII(label): try: # Step 1: try ASCII label = label.encode("ascii") - except UnicodeError: + except UnicodeEncodeError: pass else: # Skip to step 3: UseSTD3ASCIIRules is false, so # Skip to step 8. if 0 < len(label) < 64: return label - raise UnicodeError("label empty or too long") + b_label = label.decode("ascii") + raise UnicodeEncodeError("idna", b_label, 0, len(b_label), "label empty or too long") # Step 2: nameprep label = nameprep(label) @@ -77,17 +82,18 @@ def ToASCII(label): # Step 4: try ASCII try: label = label.encode("ascii") - except UnicodeError: + except UnicodeEncodeError: pass else: # Skip to step 8. 
if 0 < len(label) < 64: return label - raise UnicodeError("label empty or too long") + b_label = label.decode("ascii") + raise UnicodeEncodeError("idna", b_label, 0, len(b_label), "label empty or too long") # Step 5: Check ACE prefix if label.startswith(sace_prefix): - raise UnicodeError("Label starts with ACE prefix") + raise UnicodeEncodeError("idna", label.decode("ascii"), 0, len(sace_prefix), "Label starts with ACE prefix") # Step 6: Encode with PUNYCODE label = label.encode("punycode") @@ -98,7 +104,8 @@ def ToASCII(label): # Step 8: Check size if 0 < len(label) < 64: return label - raise UnicodeError("label empty or too long") + b_label = label.decode("punycode") + raise UnicodeEncodeError("idna", b_label, 0, len(b_label), "label empty or too long") def ToUnicode(label): if len(label) > 1024: @@ -110,7 +117,8 @@ def ToUnicode(label): # per https://www.rfc-editor.org/rfc/rfc3454#section-3.1 while still # preventing us from wasting time decoding a big thing that'll just # hit the actual <= 63 length limit in Step 6. 
- raise UnicodeError("label way too long") + b_label = label.decode("ascii") + raise UnicodeEncodeError("idna", b_label, 0, len(b_label), "label way too long") # Step 1: Check for ASCII if isinstance(label, bytes): pure_ascii = True @@ -118,7 +126,7 @@ def ToUnicode(label): try: label = label.encode("ascii") pure_ascii = True - except UnicodeError: + except UnicodeEncodeError: pure_ascii = False if not pure_ascii: # Step 2: Perform nameprep @@ -126,8 +134,9 @@ def ToUnicode(label): # It doesn't say this, but apparently, it should be ASCII now try: label = label.encode("ascii") - except UnicodeError: - raise UnicodeError("Invalid character in IDN label") + except UnicodeEncodeError as exc: + b_label = label.decode("ascii") + raise UnicodeEncodeError("idna", b_label, exc.start, exc.end, "Invalid character in IDN label") # Step 3: Check for ACE prefix if not label.startswith(ace_prefix): return str(label, "ascii") @@ -144,7 +153,8 @@ def ToUnicode(label): # Step 7: Compare the result of step 6 with the one of step 3 # label2 will already be in lower case. 
if str(label, "ascii").lower() != str(label2, "ascii"): - raise UnicodeError("IDNA does not round-trip", label, label2) + b_label = label.decode("ascii") + raise UnicodeEncodeError("idna", b_label, 0, len(b_label), f"IDNA does not round-trip, '{label!r}' != '{label2!r}'") # Step 8: return the result of step 5 return result @@ -156,7 +166,7 @@ def encode(self, input, errors='strict'): if errors != 'strict': # IDNA is quite clear that implementations must be strict - raise UnicodeError("unsupported error handling "+errors) + raise UnicodeEncodeError("idna", input, 0, 0, "unsupported error handling "+errors) if not input: return b'', 0 @@ -168,11 +178,15 @@ def encode(self, input, errors='strict'): else: # ASCII name: fast path labels = result.split(b'.') + index = 0 for label in labels[:-1]: if not (0 < len(label) < 64): - raise UnicodeError("label empty or too long") + b_label = label.decode("ascii") + raise UnicodeEncodeError("idna", b_label, index, index+len(b_label), "label empty or too long") + index += len(label) + 1 if len(labels[-1]) >= 64: - raise UnicodeError("label too long") + b_label = label.decode("ascii") + raise UnicodeEncodeError("idna", b_label, len(input)-len(b_label), len(input), "label too long") return result, len(input) result = bytearray() @@ -186,13 +200,22 @@ def encode(self, input, errors='strict'): if result: # Join with U+002E result.extend(b'.') - result.extend(ToASCII(label)) + try: + result.extend(ToASCII(label)) + except UnicodeEncodeError as exc: + raise UnicodeEncodeError( + "idna", + input, + exc.start + len(result), + exc.start + len(result) + len(label), + exc.reason, + ) return bytes(result+trailing_dot), len(input) def decode(self, input, errors='strict'): if errors != 'strict': - raise UnicodeError("Unsupported error handling "+errors) + raise UnicodeDecodeError("idna", input, 0, 0, "Unsupported error handling "+errors) if not input: return "", 0 @@ -219,7 +242,19 @@ def decode(self, input, errors='strict'): result = [] for 
label in labels: - result.append(ToUnicode(label)) + try: + u_label = ToUnicode(label) + except UnicodeEncodeError as exc: + size = sum(len(x) for x in result) + raise UnicodeDecodeError( + "idna", + input, + size, + size + len(label), + exc.reason, + ) + else: + result.append(u_label) return ".".join(result)+trailing_dot, len(input) @@ -227,7 +262,7 @@ class IncrementalEncoder(codecs.BufferedIncrementalEncoder): def _buffer_encode(self, input, errors, final): if errors != 'strict': # IDNA is quite clear that implementations must be strict - raise UnicodeError("unsupported error handling "+errors) + raise UnicodeEncodeError("idna", input, 0, 0, "Unsupported error handling "+errors) if not input: return (b'', 0) @@ -251,7 +286,16 @@ def _buffer_encode(self, input, errors, final): # Join with U+002E result.extend(b'.') size += 1 - result.extend(ToASCII(label)) + try: + result.extend(ToASCII(label)) + except UnicodeEncodeError as exc: + raise UnicodeEncodeError( + "idna", + input, + exc.start + size, + exc.start + size + len(label), + exc.reason, + ) size += len(label) result += trailing_dot @@ -261,7 +305,7 @@ def _buffer_encode(self, input, errors, final): class IncrementalDecoder(codecs.BufferedIncrementalDecoder): def _buffer_decode(self, input, errors, final): if errors != 'strict': - raise UnicodeError("Unsupported error handling "+errors) + raise UnicodeDecodeError("idna", input, 0, 0, "Unsupported error handling "+errors) if not input: return ("", 0) @@ -288,7 +332,18 @@ def _buffer_decode(self, input, errors, final): result = [] size = 0 for label in labels: - result.append(ToUnicode(label)) + try: + u_label = ToUnicode(label) + except UnicodeEncodeError as exc: + raise UnicodeDecodeError( + "idna", + input, + size, + size + len(label), + exc.reason, + ) + else: + result.append(u_label) if size: size += 1 size += len(label) diff --git a/Lib/encodings/punycode.py b/Lib/encodings/punycode.py index 1c5726447077b1..fdf41239348326 100644 --- 
a/Lib/encodings/punycode.py +++ b/Lib/encodings/punycode.py @@ -134,7 +134,8 @@ def decode_generalized_number(extended, extpos, bias, errors): char = ord(extended[extpos]) except IndexError: if errors == "strict": - raise UnicodeError("incomplete punicode string") + b_extended = extended.encode("ascii") + raise UnicodeDecodeError("punycode", b_extended, extpos, extpos+1, "incomplete punycode string") return extpos + 1, None extpos += 1 if 0x41 <= char <= 0x5A: # A-Z @@ -142,8 +143,8 @@ def decode_generalized_number(extended, extpos, bias, errors): elif 0x30 <= char <= 0x39: digit = char - 22 # 0x30-26 elif errors == "strict": - raise UnicodeError("Invalid extended code point '%s'" - % extended[extpos-1]) + b_extended = extended.encode("ascii") + raise UnicodeDecodeError("punycode", b_extended, extpos-1, extpos, f"Invalid extended code point '{extended[extpos-1]}'") else: return extpos, None t = T(j, bias) @@ -171,7 +172,7 @@ def insertion_sort(base, extended, errors): char += pos // (len(base) + 1) if char > 0x10FFFF: if errors == "strict": - raise UnicodeError("Invalid character U+%x" % char) + raise UnicodeDecodeError("punycode", base, pos, newpos, f"Invalid character U+{char:x}") char = ord('?') pos = pos % (len(base) + 1) base = base[:pos] + chr(char) + base[pos:] @@ -203,7 +204,7 @@ def encode(self, input, errors='strict'): def decode(self, input, errors='strict'): if errors not in ('strict', 'replace', 'ignore'): - raise UnicodeError("Unsupported error handling "+errors) + raise UnicodeDecodeError("punycode", input, 0, 0, "Unsupported error handling "+errors) res = punycode_decode(input, errors) return res, len(input) @@ -214,7 +215,7 @@ def encode(self, input, final=False): class IncrementalDecoder(codecs.IncrementalDecoder): def decode(self, input, final=False): if self.errors not in ('strict', 'replace', 'ignore'): - raise UnicodeError("Unsupported error handling "+self.errors) + raise UnicodeDecodeError("punycode", input, 0, 0, "Unsupported error handling 
"+self.errors) return punycode_decode(input, self.errors) class StreamWriter(Codec,codecs.StreamWriter): diff --git a/Lib/encodings/undefined.py b/Lib/encodings/undefined.py index 4690288355c710..e889816ba0269b 100644 --- a/Lib/encodings/undefined.py +++ b/Lib/encodings/undefined.py @@ -1,6 +1,6 @@ """ Python 'undefined' Codec - This codec will always raise a ValueError exception when being + This codec will always raise a UnicodeError exception when being used. It is intended for use by the site.py file to switch off automatic string to Unicode coercion. @@ -16,18 +16,18 @@ class Codec(codecs.Codec): def encode(self,input,errors='strict'): - raise UnicodeError("undefined encoding") + raise UnicodeEncodeError("undefined", input, 0, len(input), "undefined encoding") def decode(self,input,errors='strict'): - raise UnicodeError("undefined encoding") + raise UnicodeDecodeError("undefined", input, 0, len(input), "undefined encoding") class IncrementalEncoder(codecs.IncrementalEncoder): def encode(self, input, final=False): - raise UnicodeError("undefined encoding") + raise UnicodeEncodeError("undefined", input, 0, len(input), "undefined encoding") class IncrementalDecoder(codecs.IncrementalDecoder): def decode(self, input, final=False): - raise UnicodeError("undefined encoding") + raise UnicodeDecodeError("undefined", input, 0, len(input), "undefined encoding") class StreamWriter(Codec,codecs.StreamWriter): pass diff --git a/Lib/encodings/utf_16.py b/Lib/encodings/utf_16.py index c61248242be8c7..d3b9980026666f 100644 --- a/Lib/encodings/utf_16.py +++ b/Lib/encodings/utf_16.py @@ -64,7 +64,7 @@ def _buffer_decode(self, input, errors, final): elif byteorder == 1: self.decoder = codecs.utf_16_be_decode elif consumed >= 2: - raise UnicodeError("UTF-16 stream does not start with BOM") + raise UnicodeDecodeError("utf-16", input, 0, 2, "Stream does not start with BOM") return (output, consumed) return self.decoder(input, self.errors, final) @@ -138,7 +138,7 @@ def decode(self, 
input, errors='strict'): elif byteorder == 1: self.decode = codecs.utf_16_be_decode elif consumed>=2: - raise UnicodeError("UTF-16 stream does not start with BOM") + raise UnicodeDecodeError("utf-16", input, 0, 2, "Stream does not start with BOM") return (object, consumed) ### encodings module API diff --git a/Lib/encodings/utf_32.py b/Lib/encodings/utf_32.py index cdf84d14129a62..1924bedbb74c68 100644 --- a/Lib/encodings/utf_32.py +++ b/Lib/encodings/utf_32.py @@ -59,7 +59,7 @@ def _buffer_decode(self, input, errors, final): elif byteorder == 1: self.decoder = codecs.utf_32_be_decode elif consumed >= 4: - raise UnicodeError("UTF-32 stream does not start with BOM") + raise UnicodeDecodeError("utf-32", input, 0, 4, "Stream does not start with BOM") return (output, consumed) return self.decoder(input, self.errors, final) @@ -132,8 +132,8 @@ def decode(self, input, errors='strict'): self.decode = codecs.utf_32_le_decode elif byteorder == 1: self.decode = codecs.utf_32_be_decode - elif consumed>=4: - raise UnicodeError("UTF-32 stream does not start with BOM") + elif consumed >= 4: + raise UnicodeDecodeError("utf-32", input, 0, 4, "Stream does not start with BOM") return (object, consumed) ### encodings module API diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index ff511a625a0194..032e1c2c69d8d9 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -482,11 +482,11 @@ def test_only_one_bom(self): def test_badbom(self): s = io.BytesIO(4*b"\xff") f = codecs.getreader(self.encoding)(s) - self.assertRaises(UnicodeError, f.read) + self.assertRaises(UnicodeDecodeError, f.read) s = io.BytesIO(8*b"\xff") f = codecs.getreader(self.encoding)(s) - self.assertRaises(UnicodeError, f.read) + self.assertRaises(UnicodeDecodeError, f.read) def test_partial(self): self.check_partial( @@ -666,11 +666,11 @@ def test_only_one_bom(self): def test_badbom(self): s = io.BytesIO(b"\xff\xff") f = codecs.getreader(self.encoding)(s) - self.assertRaises(UnicodeError, 
f.read) + self.assertRaises(UnicodeDecodeError, f.read) s = io.BytesIO(b"\xff\xff\xff\xff") f = codecs.getreader(self.encoding)(s) - self.assertRaises(UnicodeError, f.read) + self.assertRaises(UnicodeDecodeError, f.read) def test_partial(self): self.check_partial( @@ -1356,13 +1356,13 @@ def test_decode(self): def test_decode_invalid(self): testcases = [ - (b"xn--w&", "strict", UnicodeError()), + (b"xn--w&", "strict", UnicodeDecodeError("punycode", b"xn--w&", 0, 6, "")), (b"xn--w&", "ignore", "xn-"), ] for puny, errors, expected in testcases: with self.subTest(puny=puny, errors=errors): if isinstance(expected, Exception): - self.assertRaises(UnicodeError, puny.decode, "punycode", errors) + self.assertRaises(UnicodeDecodeError, puny.decode, "punycode", errors) else: self.assertEqual(puny.decode("punycode", errors), expected) @@ -1532,7 +1532,7 @@ def test_nameprep(self): orig = str(orig, "utf-8", "surrogatepass") if prepped is None: # Input contains prohibited characters - self.assertRaises(UnicodeError, nameprep, orig) + self.assertRaises(UnicodeEncodeError, nameprep, orig) else: prepped = str(prepped, "utf-8", "surrogatepass") try: @@ -1555,9 +1555,9 @@ def test_builtin_encode(self): self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.") def test_builtin_decode_length_limit(self): - with self.assertRaisesRegex(UnicodeError, "way too long"): + with self.assertRaisesRegex(UnicodeDecodeError, "way too long"): (b"xn--016c"+b"a"*1100).decode("idna") - with self.assertRaisesRegex(UnicodeError, "too long"): + with self.assertRaisesRegex(UnicodeDecodeError, "too long"): (b"xn--016c"+b"a"*70).decode("idna") def test_stream(self): @@ -1744,14 +1744,14 @@ def test_open(self): self.assertIsInstance(file, codecs.StreamReaderWriter) def test_undefined(self): - self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined') - self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined') - self.assertRaises(UnicodeError, codecs.encode, '', 'undefined') 
- self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined') + self.assertRaises(UnicodeEncodeError, codecs.encode, 'abc', 'undefined') + self.assertRaises(UnicodeDecodeError, codecs.decode, b'abc', 'undefined') + self.assertRaises(UnicodeEncodeError, codecs.encode, '', 'undefined') + self.assertRaises(UnicodeDecodeError, codecs.decode, b'', 'undefined') for errors in ('strict', 'ignore', 'replace', 'backslashreplace'): - self.assertRaises(UnicodeError, + self.assertRaises(UnicodeEncodeError, codecs.encode, 'abc', 'undefined', errors) - self.assertRaises(UnicodeError, + self.assertRaises(UnicodeDecodeError, codecs.decode, b'abc', 'undefined', errors) def test_file_closes_if_lookup_error_raised(self): diff --git a/Lib/test/test_multibytecodec.py b/Lib/test/test_multibytecodec.py index 6451df14696933..f5d483f981a0fe 100644 --- a/Lib/test/test_multibytecodec.py +++ b/Lib/test/test_multibytecodec.py @@ -193,7 +193,7 @@ def test_setstate_validates_input_size(self): b"\x00\x00\x00\x00\x00\x00\x00\x00" b"\x00\x00\x00\x00\x00\x00\x00\x00", 'little') - self.assertRaises(UnicodeError, encoder.setstate, pending_size_nine) + self.assertRaises(UnicodeEncodeError, encoder.setstate, pending_size_nine) def test_setstate_validates_input_bytes(self): encoder = codecs.getincrementalencoder('euc_jp')() @@ -303,7 +303,7 @@ def test_setstate_validates_input(self): self.assertRaises(TypeError, decoder.setstate, 123) self.assertRaises(TypeError, decoder.setstate, ("invalid", 0)) self.assertRaises(TypeError, decoder.setstate, (b"1234", "invalid")) - self.assertRaises(UnicodeError, decoder.setstate, (b"123456789", 0)) + self.assertRaises(UnicodeDecodeError, decoder.setstate, (b"123456789", 0)) class Test_StreamReader(unittest.TestCase): def test_bug1728403(self): diff --git a/Modules/cjkcodecs/multibytecodec.c b/Modules/cjkcodecs/multibytecodec.c index 5d3c16a98423ba..4d27f30ab2e283 100644 --- a/Modules/cjkcodecs/multibytecodec.c +++ b/Modules/cjkcodecs/multibytecodec.c @@ -776,6 
+776,7 @@ encoder_encode_stateful(MultibyteStatefulEncoderContext *ctx, PyObject *inbuf = NULL; Py_ssize_t inpos, datalen; PyObject *origpending = NULL; + PyObject *excobj = NULL; if (PyUnicode_Check(unistr)) ucvt = NULL; @@ -825,8 +826,13 @@ encoder_encode_stateful(MultibyteStatefulEncoderContext *ctx, if (inpos < datalen) { if (datalen - inpos > MAXENCPENDING) { /* normal codecs can't reach here */ - PyErr_SetString(PyExc_UnicodeError, - "pending buffer overflow"); + excobj = PyObject_CallFunction(PyExc_UnicodeEncodeError, + "ssnns", + ctx->codec->encoding, + (const char*)PyUnicode_AsUTF8(inbuf), + inpos, inpos + datalen, + "pending buffer overflow"); + PyErr_SetObject(PyExc_UnicodeEncodeError, excobj); goto errorexit; } ctx->pending = PyUnicode_Substring(inbuf, inpos, datalen); @@ -845,6 +851,7 @@ encoder_encode_stateful(MultibyteStatefulEncoderContext *ctx, Py_XDECREF(ucvt); Py_XDECREF(origpending); Py_XDECREF(inbuf); + Py_XDECREF(excobj); return NULL; } @@ -853,16 +860,25 @@ decoder_append_pending(MultibyteStatefulDecoderContext *ctx, MultibyteDecodeBuffer *buf) { Py_ssize_t npendings; + PyObject *excobj = NULL; npendings = (Py_ssize_t)(buf->inbuf_end - buf->inbuf); if (npendings + ctx->pendingsize > MAXDECPENDING || npendings > PY_SSIZE_T_MAX - ctx->pendingsize) { - PyErr_SetString(PyExc_UnicodeError, "pending buffer overflow"); - return -1; + Py_ssize_t bufsize = (Py_ssize_t)(buf->inbuf_end - buf->inbuf_top); + excobj = PyUnicodeDecodeError_Create(ctx->codec->encoding, + (const char *)buf->inbuf_top, bufsize, + 0, bufsize, "pending buffer overflow"); + PyErr_SetObject(PyExc_UnicodeDecodeError, excobj); + goto errorexit; } memcpy(ctx->pending + ctx->pendingsize, buf->inbuf, npendings); ctx->pendingsize += npendings; return 0; + +errorexit: + Py_XDECREF(excobj); + return -1; } static int @@ -931,15 +947,22 @@ _multibytecodec_MultibyteIncrementalEncoder_getstate_impl(MultibyteIncrementalEn Py_ssize_t statesize; const char *pendingbuffer = NULL; Py_ssize_t 
pendingsize; + PyObject *excobj = NULL; if (self->pending != NULL) { pendingbuffer = PyUnicode_AsUTF8AndSize(self->pending, &pendingsize); if (pendingbuffer == NULL) { - return NULL; + goto errorexit; } if (pendingsize > MAXENCPENDING*4) { - PyErr_SetString(PyExc_UnicodeError, "pending buffer too large"); - return NULL; + excobj = PyObject_CallFunction(PyExc_UnicodeEncodeError, + "ssnns", + self->codec->encoding, + pendingbuffer, + 0, pendingsize, + "pending buffer too large"); + PyErr_SetObject(PyExc_UnicodeEncodeError, excobj); + goto errorexit; } statebytes[0] = (unsigned char)pendingsize; memcpy(statebytes + 1, pendingbuffer, pendingsize); @@ -955,6 +978,9 @@ _multibytecodec_MultibyteIncrementalEncoder_getstate_impl(MultibyteIncrementalEn return (PyObject *)_PyLong_FromByteArray(statebytes, statesize, 1 /* little-endian */ , 0 /* unsigned */ ); +errorexit: + Py_XDECREF(excobj); + return NULL; } /*[clinic input] @@ -970,6 +996,7 @@ _multibytecodec_MultibyteIncrementalEncoder_setstate_impl(MultibyteIncrementalEn { PyObject *pending = NULL; unsigned char statebytes[1 + MAXENCPENDING*4 + sizeof(self->state.c)]; + PyObject *excobj = NULL; if (_PyLong_AsByteArray(statelong, statebytes, sizeof(statebytes), 1 /* little-endian */ , @@ -978,8 +1005,14 @@ _multibytecodec_MultibyteIncrementalEncoder_setstate_impl(MultibyteIncrementalEn } if (statebytes[0] > MAXENCPENDING*4) { - PyErr_SetString(PyExc_UnicodeError, "pending buffer too large"); - return NULL; + excobj = PyObject_CallFunction(PyExc_UnicodeEncodeError, + "ssnns", + self->codec->encoding, + statebytes, + 0, sizeof(statebytes), + "pending buffer too large"); + PyErr_SetObject(PyExc_UnicodeEncodeError, excobj); + goto errorexit; } pending = PyUnicode_DecodeUTF8((const char *)statebytes+1, @@ -996,6 +1029,7 @@ _multibytecodec_MultibyteIncrementalEncoder_setstate_impl(MultibyteIncrementalEn errorexit: Py_XDECREF(pending); + Py_XDECREF(excobj); return NULL; } @@ -1246,6 +1280,7 @@ 
_multibytecodec_MultibyteIncrementalDecoder_setstate_impl(MultibyteIncrementalDe Py_ssize_t buffersize; const char *bufferstr; unsigned char statebytes[8]; + PyObject *excobj = NULL; if (!PyArg_ParseTuple(state, "SO!;setstate(): illegal state argument", &buffer, &PyLong_Type, &statelong)) @@ -1265,7 +1300,12 @@ _multibytecodec_MultibyteIncrementalDecoder_setstate_impl(MultibyteIncrementalDe } if (buffersize > MAXDECPENDING) { - PyErr_SetString(PyExc_UnicodeError, "pending buffer too large"); + excobj = PyUnicodeDecodeError_Create(self->codec->encoding, + (const char *)buffer, buffersize, + 0, buffersize, + "pending buffer too large"); + PyErr_SetObject(PyExc_UnicodeDecodeError, excobj); + Py_XDECREF(excobj); return NULL; } From e92d4146045b5c6c8bcd2c53b016607fea3af7cf Mon Sep 17 00:00:00 2001 From: John Sloboda Date: Tue, 2 Jan 2024 22:47:28 -0500 Subject: [PATCH 02/22] add news blurb --- .../next/Library/2024-01-02-22-47-12.gh-issue-85287.ZC5DLj.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2024-01-02-22-47-12.gh-issue-85287.ZC5DLj.rst diff --git a/Misc/NEWS.d/next/Library/2024-01-02-22-47-12.gh-issue-85287.ZC5DLj.rst b/Misc/NEWS.d/next/Library/2024-01-02-22-47-12.gh-issue-85287.ZC5DLj.rst new file mode 100644 index 00000000000000..e6d031fbc93e83 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-01-02-22-47-12.gh-issue-85287.ZC5DLj.rst @@ -0,0 +1,2 @@ +Changes Unicode codecs to raise UnicodeEncodeError or UnicodeDecodeError, +rather than just UnicodeError. 
From 10e7cd04d50baa793962d2ae34c45304035f26b0 Mon Sep 17 00:00:00 2001 From: John Sloboda Date: Wed, 3 Jan 2024 01:21:53 -0500 Subject: [PATCH 03/22] add more lenient unicode error handling within the except blocks --- Lib/encodings/idna.py | 72 +++++++++++++++--------------- Lib/encodings/punycode.py | 30 +++++++++---- Modules/cjkcodecs/multibytecodec.c | 2 +- 3 files changed, 60 insertions(+), 44 deletions(-) diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py index 8a33c82c797b27..d7189247e19e29 100644 --- a/Lib/encodings/idna.py +++ b/Lib/encodings/idna.py @@ -35,7 +35,7 @@ def nameprep(label): stringprep.in_table_c7(c) or \ stringprep.in_table_c8(c) or \ stringprep.in_table_c9(c): - raise UnicodeDecodeError("idna", label.encode("ascii"), i, i+1, f"Invalid character {c!r}") + raise UnicodeEncodeError("idna", label, i, i+1, f"Invalid character {c!r}") # Check bidi RandAL = [stringprep.in_table_d1(x) for x in label] @@ -48,16 +48,17 @@ def nameprep(label): # MUST NOT contain any LCat character. for i, x in enumerate(label): if stringprep.in_table_d2(x): - raise UnicodeDecodeError("idna", label.encode("ascii"), i, i+1, "Violation of BIDI requirement 2") + raise UnicodeEncodeError("idna", label, i, i+1, "Violation of BIDI requirement 2") # 3) If a string contains any RandALCat character, a # RandALCat character MUST be the first character of the # string, and a RandALCat character MUST be the last # character of the string. if not RandAL[0]: - raise UnicodeDecodeError("idna", label.encode("ascii"), 0, 1, "Violation of BIDI requirement 3") + raise UnicodeEncodeError( + "idna", label, + 0, 1, "Violation of BIDI requirement 3") if not RandAL[-1]: - s_label = label.encode("ascii") - raise UnicodeDecodeError("idna", s_label, len(s_label)-1, len(s_label), "Violation of BIDI requirement 3") + raise UnicodeEncodeError("idna", label, len(label)-1, len(label), "Violation of BIDI requirement 3") return label @@ -72,8 +73,8 @@ def ToASCII(label): # Skip to step 8. 
if 0 < len(label) < 64: return label - b_label = label.decode("ascii") - raise UnicodeEncodeError("idna", b_label, 0, len(b_label), "label empty or too long") + label = label.decode("ascii", errors="backslashreplace") + raise UnicodeEncodeError("idna", label, 0, len(label), "label empty or too long") # Step 2: nameprep label = nameprep(label) @@ -88,12 +89,14 @@ def ToASCII(label): # Skip to step 8. if 0 < len(label) < 64: return label - b_label = label.decode("ascii") - raise UnicodeEncodeError("idna", b_label, 0, len(b_label), "label empty or too long") + label = label.decode("ascii", errors="backslashreplace") + raise UnicodeEncodeError("idna", label, 0, len(label), "label empty or too long") # Step 5: Check ACE prefix if label.startswith(sace_prefix): - raise UnicodeEncodeError("idna", label.decode("ascii"), 0, len(sace_prefix), "Label starts with ACE prefix") + raise UnicodeEncodeError( + "idna", label.decode("ascii", errors="backslashreplace"), + 0, len(sace_prefix), "Label starts with ACE prefix") # Step 6: Encode with PUNYCODE label = label.encode("punycode") @@ -104,8 +107,8 @@ def ToASCII(label): # Step 8: Check size if 0 < len(label) < 64: return label - b_label = label.decode("punycode") - raise UnicodeEncodeError("idna", b_label, 0, len(b_label), "label empty or too long") + label = label.decode("punycode", errors="replace") + raise UnicodeEncodeError("idna", label, 0, len(label), "label empty or too long") def ToUnicode(label): if len(label) > 1024: @@ -117,8 +120,9 @@ def ToUnicode(label): # per https://www.rfc-editor.org/rfc/rfc3454#section-3.1 while still # preventing us from wasting time decoding a big thing that'll just # hit the actual <= 63 length limit in Step 6. 
- b_label = label.decode("ascii") - raise UnicodeEncodeError("idna", b_label, 0, len(b_label), "label way too long") + if isinstance(label, bytes): + label = label.decode("utf-8", errors="backslashreplace") + raise UnicodeEncodeError("idna", label, 0, len(label), "label way too long") # Step 1: Check for ASCII if isinstance(label, bytes): pure_ascii = True @@ -135,8 +139,9 @@ def ToUnicode(label): try: label = label.encode("ascii") except UnicodeEncodeError as exc: - b_label = label.decode("ascii") - raise UnicodeEncodeError("idna", b_label, exc.start, exc.end, "Invalid character in IDN label") + if isinstance(label, bytes): + label = label.decode("utf-8", errors="backslashreplace") + raise UnicodeEncodeError("idna", label, exc.start, exc.end, "Invalid character in IDN label") # Step 3: Check for ACE prefix if not label.startswith(ace_prefix): return str(label, "ascii") @@ -153,8 +158,7 @@ def ToUnicode(label): # Step 7: Compare the result of step 6 with the one of step 3 # label2 will already be in lower case. 
if str(label, "ascii").lower() != str(label2, "ascii"): - b_label = label.decode("ascii") - raise UnicodeEncodeError("idna", b_label, 0, len(b_label), f"IDNA does not round-trip, '{label!r}' != '{label2!r}'") + raise UnicodeEncodeError("idna", label, 0, len(label), f"IDNA does not round-trip, '{label!r}' != '{label2!r}'") # Step 8: return the result of step 5 return result @@ -166,7 +170,7 @@ def encode(self, input, errors='strict'): if errors != 'strict': # IDNA is quite clear that implementations must be strict - raise UnicodeEncodeError("idna", input, 0, 0, "unsupported error handling "+errors) + raise UnicodeEncodeError("idna", input, 0, 1, f"unsupported error handling {errors}") if not input: return b'', 0 @@ -181,12 +185,10 @@ def encode(self, input, errors='strict'): index = 0 for label in labels[:-1]: if not (0 < len(label) < 64): - b_label = label.decode("ascii") - raise UnicodeEncodeError("idna", b_label, index, index+len(b_label), "label empty or too long") + raise UnicodeEncodeError("idna", input, index, index+len(label), "label empty or too long") index += len(label) + 1 if len(labels[-1]) >= 64: - b_label = label.decode("ascii") - raise UnicodeEncodeError("idna", b_label, len(input)-len(b_label), len(input), "label too long") + raise UnicodeEncodeError("idna", input, index, len(input), "label too long") return result, len(input) result = bytearray() @@ -206,8 +208,8 @@ def encode(self, input, errors='strict'): raise UnicodeEncodeError( "idna", input, - exc.start + len(result), - exc.start + len(result) + len(label), + len(result) + exc.start, + len(result) + exc.end, exc.reason, ) return bytes(result+trailing_dot), len(input) @@ -215,7 +217,7 @@ def encode(self, input, errors='strict'): def decode(self, input, errors='strict'): if errors != 'strict': - raise UnicodeDecodeError("idna", input, 0, 0, "Unsupported error handling "+errors) + raise UnicodeDecodeError("idna", input, 0, 1, f"Unsupported error handling {errors}") if not input: return "", 0 @@ 
-245,12 +247,12 @@ def decode(self, input, errors='strict'): try: u_label = ToUnicode(label) except UnicodeEncodeError as exc: - size = sum(len(x) for x in result) + len_result = sum(len(x) for x in result) + len(result) raise UnicodeDecodeError( "idna", input, - size, - size + len(label), + len_result + exc.start, + len_result + exc.end, exc.reason, ) else: @@ -262,7 +264,7 @@ class IncrementalEncoder(codecs.BufferedIncrementalEncoder): def _buffer_encode(self, input, errors, final): if errors != 'strict': # IDNA is quite clear that implementations must be strict - raise UnicodeEncodeError("idna", input, 0, 0, "Unsupported error handling "+errors) + raise UnicodeEncodeError("idna", input, 0, 1, f"Unsupported error handling {errors}") if not input: return (b'', 0) @@ -292,8 +294,8 @@ def _buffer_encode(self, input, errors, final): raise UnicodeEncodeError( "idna", input, - exc.start + size, - exc.start + size + len(label), + size + exc.start, + size + exc.end, exc.reason, ) size += len(label) @@ -305,7 +307,7 @@ def _buffer_encode(self, input, errors, final): class IncrementalDecoder(codecs.BufferedIncrementalDecoder): def _buffer_decode(self, input, errors, final): if errors != 'strict': - raise UnicodeDecodeError("idna", input, 0, 0, "Unsupported error handling "+errors) + raise UnicodeDecodeError("idna", input, 0, 1, f"Unsupported error handling {errors}") if not input: return ("", 0) @@ -338,8 +340,8 @@ def _buffer_decode(self, input, errors, final): raise UnicodeDecodeError( "idna", input, - size, - size + len(label), + size + exc.start, + size + exc.end, exc.reason, ) else: diff --git a/Lib/encodings/punycode.py b/Lib/encodings/punycode.py index fdf41239348326..dda1341bac542a 100644 --- a/Lib/encodings/punycode.py +++ b/Lib/encodings/punycode.py @@ -1,4 +1,4 @@ -""" Codec for the Punicode encoding, as specified in RFC 3492 +""" Codec for the Punycode encoding, as specified in RFC 3492 Written by Martin v. Löwis. 
""" @@ -134,7 +134,7 @@ def decode_generalized_number(extended, extpos, bias, errors): char = ord(extended[extpos]) except IndexError: if errors == "strict": - b_extended = extended.encode("ascii") + b_extended = extended.encode("utf-8", errors="backslashreplace") raise UnicodeDecodeError("punycode", b_extended, extpos, extpos+1, "incomplete punycode string") return extpos + 1, None extpos += 1 @@ -143,7 +143,7 @@ def decode_generalized_number(extended, extpos, bias, errors): elif 0x30 <= char <= 0x39: digit = char - 22 # 0x30-26 elif errors == "strict": - b_extended = extended.encode("ascii") + b_extended = extended.encode("utf-8", errors="backslashreplace") raise UnicodeDecodeError("punycode", b_extended, extpos-1, extpos, f"Invalid extended code point '{extended[extpos-1]}'") else: return extpos, None @@ -162,8 +162,17 @@ def insertion_sort(base, extended, errors): bias = 72 extpos = 0 while extpos < len(extended): - newpos, delta = decode_generalized_number(extended, extpos, - bias, errors) + try: + newpos, delta = decode_generalized_number(extended, extpos, + bias, errors) + except UnicodeDecodeError as exc: + raise UnicodeDecodeError( + "punycode", + base.encode("utf-8", errors="backslashreplace") + + b"-" + + extended.encode("utf-8", errors="backslashreplace"), + pos + exc.start, pos + exc.end, exc.reason) + if delta is None: # There was an error in decoding. We can't continue because # synchronization is lost. 
@@ -172,7 +181,12 @@ def insertion_sort(base, extended, errors): char += pos // (len(base) + 1) if char > 0x10FFFF: if errors == "strict": - raise UnicodeDecodeError("punycode", base, pos, newpos, f"Invalid character U+{char:x}") + raise UnicodeDecodeError( + "punycode", + base.encode("utf-8", errors="backslashreplace") + + b"-" + + extended.encode("utf-8", errors="backslashreplace"), + pos, pos+1, f"Invalid character U+{char:x}") char = ord('?') pos = pos % (len(base) + 1) base = base[:pos] + chr(char) + base[pos:] @@ -204,7 +218,7 @@ def encode(self, input, errors='strict'): def decode(self, input, errors='strict'): if errors not in ('strict', 'replace', 'ignore'): - raise UnicodeDecodeError("punycode", input, 0, 0, "Unsupported error handling "+errors) + raise UnicodeDecodeError("punycode", input, 0, 1, f"Unsupported error handling {errors}") res = punycode_decode(input, errors) return res, len(input) @@ -215,7 +229,7 @@ def encode(self, input, final=False): class IncrementalDecoder(codecs.IncrementalDecoder): def decode(self, input, final=False): if self.errors not in ('strict', 'replace', 'ignore'): - raise UnicodeDecodeError("punycode", input, 0, 0, "Unsupported error handling "+self.errors) + raise UnicodeDecodeError("punycode", input, 0, 1, f"Unsupported error handling {self.errors}") return punycode_decode(input, self.errors) class StreamWriter(Codec,codecs.StreamWriter): diff --git a/Modules/cjkcodecs/multibytecodec.c b/Modules/cjkcodecs/multibytecodec.c index 4d27f30ab2e283..f851de9d22d079 100644 --- a/Modules/cjkcodecs/multibytecodec.c +++ b/Modules/cjkcodecs/multibytecodec.c @@ -829,7 +829,7 @@ encoder_encode_stateful(MultibyteStatefulEncoderContext *ctx, excobj = PyObject_CallFunction(PyExc_UnicodeEncodeError, "ssnns", ctx->codec->encoding, - (const char*)PyUnicode_AsUTF8(inbuf), + PyUnicode_AsUTF8(inbuf), inpos, inpos + datalen, "pending buffer overflow"); PyErr_SetObject(PyExc_UnicodeEncodeError, excobj); From 
0122f9058055eed3961d0b53b471e540989ef269 Mon Sep 17 00:00:00 2001 From: John Sloboda Date: Wed, 3 Jan 2024 01:58:11 -0500 Subject: [PATCH 04/22] fix IDNA-specific length issue --- Lib/encodings/idna.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py index d7189247e19e29..30c760e29bb31a 100644 --- a/Lib/encodings/idna.py +++ b/Lib/encodings/idna.py @@ -107,7 +107,7 @@ def ToASCII(label): # Step 8: Check size if 0 < len(label) < 64: return label - label = label.decode("punycode", errors="replace") + label = label[len(ace_prefix):].decode("punycode", errors="replace") raise UnicodeEncodeError("idna", label, 0, len(label), "label empty or too long") def ToUnicode(label): From 63948d26164610786db54b183da94dc61e8e5022 Mon Sep 17 00:00:00 2001 From: John Sloboda Date: Wed, 3 Jan 2024 06:21:08 -0500 Subject: [PATCH 05/22] fix two issues --- Modules/cjkcodecs/multibytecodec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Modules/cjkcodecs/multibytecodec.c b/Modules/cjkcodecs/multibytecodec.c index f851de9d22d079..7c6b6c383602b2 100644 --- a/Modules/cjkcodecs/multibytecodec.c +++ b/Modules/cjkcodecs/multibytecodec.c @@ -830,7 +830,7 @@ encoder_encode_stateful(MultibyteStatefulEncoderContext *ctx, "ssnns", ctx->codec->encoding, PyUnicode_AsUTF8(inbuf), - inpos, inpos + datalen, + inpos, datalen, "pending buffer overflow"); PyErr_SetObject(PyExc_UnicodeEncodeError, excobj); goto errorexit; @@ -1301,7 +1301,7 @@ _multibytecodec_MultibyteIncrementalDecoder_setstate_impl(MultibyteIncrementalDe if (buffersize > MAXDECPENDING) { excobj = PyUnicodeDecodeError_Create(self->codec->encoding, - (const char *)buffer, buffersize, + PyBytes_AS_STRING(buffer), buffersize, 0, buffersize, "pending buffer too large"); PyErr_SetObject(PyExc_UnicodeDecodeError, excobj); From 81310e3634d3ad2fdfbafc43d7d86e559fdd1452 Mon Sep 17 00:00:00 2001 From: John Sloboda Date: Fri, 5 Jan 2024 19:02:08 -0500 Subject: 
[PATCH 06/22] use plain UnicodeError for problems outside the
 en/decoded string

---
 Lib/encodings/idna.py      | 8 ++++----
 Lib/encodings/punycode.py  | 4 ++--
 Lib/encodings/undefined.py | 8 ++++----
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py
index 30c760e29bb31a..a6f8a3a66e16ea 100644
--- a/Lib/encodings/idna.py
+++ b/Lib/encodings/idna.py
@@ -170,7 +170,7 @@ def encode(self, input, errors='strict'):
 
         if errors != 'strict':
             # IDNA is quite clear that implementations must be strict
-            raise UnicodeEncodeError("idna", input, 0, 1, f"unsupported error handling {errors}")
+            raise UnicodeError(f"Unsupported error handling: {errors}")
 
         if not input:
             return b'', 0
@@ -217,7 +217,7 @@ def encode(self, input, errors='strict'):
 
     def decode(self, input, errors='strict'):
         if errors != 'strict':
-            raise UnicodeDecodeError("idna", input, 0, 1, f"Unsupported error handling {errors}")
+            raise UnicodeError(f"Unsupported error handling: {errors}")
 
         if not input:
             return "", 0
@@ -264,7 +264,7 @@ class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
     def _buffer_encode(self, input, errors, final):
         if errors != 'strict':
             # IDNA is quite clear that implementations must be strict
-            raise UnicodeEncodeError("idna", input, 0, 1, f"Unsupported error handling {errors}")
+            raise UnicodeError(f"Unsupported error handling: {errors}")
 
         if not input:
             return (b'', 0)
@@ -307,7 +307,7 @@ def _buffer_encode(self, input, errors, final):
 class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
     def _buffer_decode(self, input, errors, final):
         if errors != 'strict':
-            raise UnicodeDecodeError("idna", input, 0, 1, "Unsupported error handling {errors}")
+            raise UnicodeError(f"Unsupported error handling: {errors}")
 
         if not input:
             return ("", 0)
diff --git a/Lib/encodings/punycode.py b/Lib/encodings/punycode.py
index dda1341bac542a..c1f7a50e9d8072 100644
--- a/Lib/encodings/punycode.py
+++ b/Lib/encodings/punycode.py
@@ -218,7 +218,7 @@ def 
encode(self, input, errors='strict'): def decode(self, input, errors='strict'): if errors not in ('strict', 'replace', 'ignore'): - raise UnicodeDecodeError("punycode", input, 0, 1, f"Unsupported error handling {errors}") + raise UnicodeError(f"Unsupported error handling: {errors}") res = punycode_decode(input, errors) return res, len(input) @@ -229,7 +229,7 @@ def encode(self, input, final=False): class IncrementalDecoder(codecs.IncrementalDecoder): def decode(self, input, final=False): if self.errors not in ('strict', 'replace', 'ignore'): - raise UnicodeDecodeError("punycode", input, 0, 1, f"Unsupported error handling {self.errors}") + raise UnicodeError(f"Unsupported error handling: {self.errors}") return punycode_decode(input, self.errors) class StreamWriter(Codec,codecs.StreamWriter): diff --git a/Lib/encodings/undefined.py b/Lib/encodings/undefined.py index e889816ba0269b..082771e1c86677 100644 --- a/Lib/encodings/undefined.py +++ b/Lib/encodings/undefined.py @@ -16,18 +16,18 @@ class Codec(codecs.Codec): def encode(self,input,errors='strict'): - raise UnicodeEncodeError("undefined", input, 0, len(input), "undefined encoding") + raise UnicodeError("undefined encoding") def decode(self,input,errors='strict'): - raise UnicodeDecodeError("undefined", input, 0, len(input), "undefined encoding") + raise UnicodeError("undefined encoding") class IncrementalEncoder(codecs.IncrementalEncoder): def encode(self, input, final=False): - raise UnicodeEncodeError("undefined", input, 0, len(input), "undefined encoding") + raise UnicodeError("undefined encoding") class IncrementalDecoder(codecs.IncrementalDecoder): def decode(self, input, final=False): - raise UnicodeDecodeError("undefined", input, 0, len(input), "undefined encoding") + raise UnicodeError("undefined encoding") class StreamWriter(Codec,codecs.StreamWriter): pass From 367de4ef430bec6997ce4f4459b1e69c4534b22a Mon Sep 17 00:00:00 2001 From: John Sloboda Date: Fri, 5 Jan 2024 19:12:37 -0500 Subject: [PATCH 07/22] 
split label empty vs too long --- Lib/encodings/idna.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py index a6f8a3a66e16ea..de790a8ea6b46e 100644 --- a/Lib/encodings/idna.py +++ b/Lib/encodings/idna.py @@ -74,7 +74,10 @@ def ToASCII(label): if 0 < len(label) < 64: return label label = label.decode("ascii", errors="backslashreplace") - raise UnicodeEncodeError("idna", label, 0, len(label), "label empty or too long") + if len(label) == 0: + raise UnicodeEncodeError("idna", label, 0, 1, "label empty") + else: + raise UnicodeEncodeError("idna", label, 0, len(label), "label too long") # Step 2: nameprep label = nameprep(label) @@ -90,7 +93,10 @@ def ToASCII(label): if 0 < len(label) < 64: return label label = label.decode("ascii", errors="backslashreplace") - raise UnicodeEncodeError("idna", label, 0, len(label), "label empty or too long") + if len(label) == 0: + raise UnicodeEncodeError("idna", label, 0, 1, "label empty") + else: + raise UnicodeEncodeError("idna", label, 0, len(label), "label too long") # Step 5: Check ACE prefix if label.startswith(sace_prefix): @@ -108,7 +114,10 @@ def ToASCII(label): if 0 < len(label) < 64: return label label = label[len(ace_prefix):].decode("punycode", errors="replace") - raise UnicodeEncodeError("idna", label, 0, len(label), "label empty or too long") + if len(label) == 0: + raise UnicodeEncodeError("idna", label, 0, 1, "label empty") + else: + raise UnicodeEncodeError("idna", label, 0, len(label), "label too long") def ToUnicode(label): if len(label) > 1024: @@ -185,7 +194,10 @@ def encode(self, input, errors='strict'): index = 0 for label in labels[:-1]: if not (0 < len(label) < 64): - raise UnicodeEncodeError("idna", input, index, index+len(label), "label empty or too long") + if len(label) == 0: + raise UnicodeEncodeError("idna", input, index, index+1, "label empty") + elif len(label >= 64: + raise UnicodeEncodeError("idna", input, index, 
index+len(label), "label too long") index += len(label) + 1 if len(labels[-1]) >= 64: raise UnicodeEncodeError("idna", input, index, len(input), "label too long") From 9f575153c328bb7cf3fd664d095b5b9f5533d5ec Mon Sep 17 00:00:00 2001 From: John Sloboda Date: Fri, 5 Jan 2024 19:29:38 -0500 Subject: [PATCH 08/22] use labels input for finding error offset, not output result --- Lib/encodings/idna.py | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py index de790a8ea6b46e..26e2a9431c9174 100644 --- a/Lib/encodings/idna.py +++ b/Lib/encodings/idna.py @@ -191,16 +191,15 @@ def encode(self, input, errors='strict'): else: # ASCII name: fast path labels = result.split(b'.') - index = 0 - for label in labels[:-1]: - if not (0 < len(label) < 64): + for i, label in enumerate(labels[:-1]): if len(label) == 0: - raise UnicodeEncodeError("idna", input, index, index+1, "label empty") - elif len(label >= 64: - raise UnicodeEncodeError("idna", input, index, index+len(label), "label too long") - index += len(label) + 1 - if len(labels[-1]) >= 64: - raise UnicodeEncodeError("idna", input, index, len(input), "label too long") + offset = sum(len(l) for l in labels[:i]) + i + raise UnicodeEncodeError("idna", input, offset, offset+1, "label empty") + for i, label in enumerate(labels): + if len(label) >= 64: + offset = sum(len(l) for l in labels[:i]) + i + raise UnicodeEncodeError( + "idna", input, offset, offset+len(label), "label too long") return result, len(input) result = bytearray() @@ -210,18 +209,19 @@ def encode(self, input, errors='strict'): del labels[-1] else: trailing_dot = b'' - for label in labels: + for i, label in enumerate(labels): if result: # Join with U+002E result.extend(b'.') try: result.extend(ToASCII(label)) except UnicodeEncodeError as exc: + offset = sum(len(l) for l in labels[:i]) + i raise UnicodeEncodeError( "idna", input, - len(result) + exc.start, - len(result) + 
exc.end, + offset + exc.start, + offset + exc.end, exc.reason, ) return bytes(result+trailing_dot), len(input) @@ -259,14 +259,9 @@ def decode(self, input, errors='strict'): try: u_label = ToUnicode(label) except UnicodeEncodeError as exc: - len_result = sum(len(x) for x in result) + len(result) + offset = sum(len(x) for x in result) + len(result) raise UnicodeDecodeError( - "idna", - input, - len_result + exc.start, - len_result + exc.end, - exc.reason, - ) + "idna", input, offset+exc.start, offset+exc.end, exc.reason) else: result.append(u_label) From 389122d0e3a05cd76c9afa10c28c30fbd934fe50 Mon Sep 17 00:00:00 2001 From: John Sloboda Date: Fri, 5 Jan 2024 19:31:21 -0500 Subject: [PATCH 09/22] update test for undefined encoding --- Lib/test/test_codecs.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 032e1c2c69d8d9..7f3d45580c40b6 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1744,14 +1744,14 @@ def test_open(self): self.assertIsInstance(file, codecs.StreamReaderWriter) def test_undefined(self): - self.assertRaises(UnicodeEncodeError, codecs.encode, 'abc', 'undefined') - self.assertRaises(UnicodeDecodeError, codecs.decode, b'abc', 'undefined') - self.assertRaises(UnicodeEncodeError, codecs.encode, '', 'undefined') - self.assertRaises(UnicodeDecodeError, codecs.decode, b'', 'undefined') + self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined') + self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined') + self.assertRaises(UnicodeError, codecs.encode, '', 'undefined') + self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined') for errors in ('strict', 'ignore', 'replace', 'backslashreplace'): - self.assertRaises(UnicodeEncodeError, + self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined', errors) - self.assertRaises(UnicodeDecodeError, + self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined', errors) 
def test_file_closes_if_lookup_error_raised(self): From fe47caa7747e52dc5641bc5034cb5cc60c38ecd9 Mon Sep 17 00:00:00 2001 From: John Sloboda Date: Fri, 5 Jan 2024 19:46:44 -0500 Subject: [PATCH 10/22] fixed linebreaks on some of the longer exceptions --- Lib/encodings/idna.py | 24 ++++++++++++++---------- Lib/encodings/punycode.py | 6 ++++-- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py index 26e2a9431c9174..3b87007317de37 100644 --- a/Lib/encodings/idna.py +++ b/Lib/encodings/idna.py @@ -48,17 +48,18 @@ def nameprep(label): # MUST NOT contain any LCat character. for i, x in enumerate(label): if stringprep.in_table_d2(x): - raise UnicodeEncodeError("idna", label, i, i+1, "Violation of BIDI requirement 2") + raise UnicodeEncodeError("idna", label, i, i+1, + "Violation of BIDI requirement 2") # 3) If a string contains any RandALCat character, a # RandALCat character MUST be the first character of the # string, and a RandALCat character MUST be the last # character of the string. 
if not RandAL[0]: - raise UnicodeEncodeError( - "idna", label, - 0, 1, "Violation of BIDI requirement 3") + raise UnicodeEncodeError("idna", label, 0, 1, + "Violation of BIDI requirement 3") if not RandAL[-1]: - raise UnicodeEncodeError("idna", label, len(label)-1, len(label), "Violation of BIDI requirement 3") + raise UnicodeEncodeError("idna", label, len(label)-1, len(label), + "Violation of BIDI requirement 3") return label @@ -150,7 +151,8 @@ def ToUnicode(label): except UnicodeEncodeError as exc: if isinstance(label, bytes): label = label.decode("utf-8", errors="backslashreplace") - raise UnicodeEncodeError("idna", label, exc.start, exc.end, "Invalid character in IDN label") + raise UnicodeEncodeError("idna", label, exc.start, exc.end, + "Invalid character in IDN label") # Step 3: Check for ACE prefix if not label.startswith(ace_prefix): return str(label, "ascii") @@ -167,7 +169,8 @@ def ToUnicode(label): # Step 7: Compare the result of step 6 with the one of step 3 # label2 will already be in lower case. 
if str(label, "ascii").lower() != str(label2, "ascii"): - raise UnicodeEncodeError("idna", label, 0, len(label), f"IDNA does not round-trip, '{label!r}' != '{label2!r}'") + raise UnicodeEncodeError("idna", label, 0, len(label), + f"IDNA does not round-trip, '{label!r}' != '{label2!r}'") # Step 8: return the result of step 5 return result @@ -194,12 +197,13 @@ def encode(self, input, errors='strict'): for i, label in enumerate(labels[:-1]): if len(label) == 0: offset = sum(len(l) for l in labels[:i]) + i - raise UnicodeEncodeError("idna", input, offset, offset+1, "label empty") + raise UnicodeEncodeError("idna", input, offset, offset+1, + "label empty") for i, label in enumerate(labels): if len(label) >= 64: offset = sum(len(l) for l in labels[:i]) + i - raise UnicodeEncodeError( - "idna", input, offset, offset+len(label), "label too long") + raise UnicodeEncodeError("idna", input, offset, offset+len(label), + "label too long") return result, len(input) result = bytearray() diff --git a/Lib/encodings/punycode.py b/Lib/encodings/punycode.py index c1f7a50e9d8072..e5e5a7ed608766 100644 --- a/Lib/encodings/punycode.py +++ b/Lib/encodings/punycode.py @@ -135,7 +135,8 @@ def decode_generalized_number(extended, extpos, bias, errors): except IndexError: if errors == "strict": b_extended = extended.encode("utf-8", errors="backslashreplace") - raise UnicodeDecodeError("punycode", b_extended, extpos, extpos+1, "incomplete punycode string") + raise UnicodeDecodeError("punycode", b_extended, extpos, extpos+1, + "incomplete punycode string") return extpos + 1, None extpos += 1 if 0x41 <= char <= 0x5A: # A-Z @@ -144,7 +145,8 @@ def decode_generalized_number(extended, extpos, bias, errors): digit = char - 22 # 0x30-26 elif errors == "strict": b_extended = extended.encode("utf-8", errors="backslashreplace") - raise UnicodeDecodeError("punycode", b_extended, extpos-1, extpos, f"Invalid extended code point '{extended[extpos-1]}'") + raise UnicodeDecodeError("punycode", b_extended, 
extpos-1, extpos, + f"Invalid extended code point '{extended[extpos-1]}'") else: return extpos, None t = T(j, bias) From 95cb5bbbdd3cb6c55ee9ebe568777e531c2dffa1 Mon Sep 17 00:00:00 2001 From: John Sloboda Date: Sun, 7 Jan 2024 16:46:50 -0500 Subject: [PATCH 11/22] add tests for unicode error offsets, and tighten up the logic for calculating them --- Lib/encodings/idna.py | 29 ++++++---- Lib/encodings/punycode.py | 21 ++++---- Lib/test/test_codecs.py | 109 +++++++++++++++++++++++++++++++++++++- 3 files changed, 138 insertions(+), 21 deletions(-) diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py index 3b87007317de37..2ae5c7572c44f5 100644 --- a/Lib/encodings/idna.py +++ b/Lib/encodings/idna.py @@ -148,7 +148,7 @@ def ToUnicode(label): # It doesn't say this, but apparently, it should be ASCII now try: label = label.encode("ascii") - except UnicodeEncodeError as exc: + except (UnicodeEncodeError, UnicodeDecodeError) as exc: if isinstance(label, bytes): label = label.decode("utf-8", errors="backslashreplace") raise UnicodeEncodeError("idna", label, exc.start, exc.end, @@ -161,7 +161,12 @@ def ToUnicode(label): label1 = label[len(ace_prefix):] # Step 5: Decode using PUNYCODE - result = label1.decode("punycode") + try: + result = label1.decode("punycode") + except (UnicodeEncodeError, UnicodeDecodeError) as exc: + offset = len(ace_prefix) + raise UnicodeEncodeError("idna", label.decode("utf-8", errors="backslashreplace"), + offset+exc.start, offset+exc.end, exc.reason) # Step 6: Apply ToASCII label2 = ToASCII(result) @@ -219,7 +224,7 @@ def encode(self, input, errors='strict'): result.extend(b'.') try: result.extend(ToASCII(label)) - except UnicodeEncodeError as exc: + except (UnicodeEncodeError, UnicodeDecodeError) as exc: offset = sum(len(l) for l in labels[:i]) + i raise UnicodeEncodeError( "idna", @@ -259,11 +264,11 @@ def decode(self, input, errors='strict'): trailing_dot = '' result = [] - for label in labels: + for i, label in enumerate(labels): try: 
u_label = ToUnicode(label) - except UnicodeEncodeError as exc: - offset = sum(len(x) for x in result) + len(result) + except (UnicodeEncodeError, UnicodeDecodeError) as exc: + offset = sum(len(x) for x in labels[:i]) + len(labels[:i]) raise UnicodeDecodeError( "idna", input, offset+exc.start, offset+exc.end, exc.reason) else: @@ -301,7 +306,7 @@ def _buffer_encode(self, input, errors, final): size += 1 try: result.extend(ToASCII(label)) - except UnicodeEncodeError as exc: + except (UnicodeEncodeError, UnicodeDecodeError) as exc: raise UnicodeEncodeError( "idna", input, @@ -328,7 +333,11 @@ def _buffer_decode(self, input, errors, final): labels = dots.split(input) else: # Must be ASCII string - input = str(input, "ascii") + try: + input = str(input, "ascii") + except (UnicodeEncodeError, UnicodeDecodeError) as exc: + raise UnicodeDecodeError("idna", input, + exc.start, exc.end, exc.reason) labels = input.split(".") trailing_dot = '' @@ -347,10 +356,10 @@ def _buffer_decode(self, input, errors, final): for label in labels: try: u_label = ToUnicode(label) - except UnicodeEncodeError as exc: + except (UnicodeEncodeError, UnicodeDecodeError) as exc: raise UnicodeDecodeError( "idna", - input, + input.encode("ascii", errors="backslashreplace"), size + exc.start, size + exc.end, exc.reason, diff --git a/Lib/encodings/punycode.py b/Lib/encodings/punycode.py index e5e5a7ed608766..62d53f5df9f3bf 100644 --- a/Lib/encodings/punycode.py +++ b/Lib/encodings/punycode.py @@ -158,11 +158,13 @@ def decode_generalized_number(extended, extpos, bias, errors): def insertion_sort(base, extended, errors): - """3.2 Insertion unsort coding""" + """3.2 Insertion sort coding""" char = 0x80 pos = -1 bias = 72 extpos = 0 + original_base, original_ext = base, extended + extended_offset = (len(original_base) + 1) if original_base else 0 while extpos < len(extended): try: newpos, delta = decode_generalized_number(extended, extpos, @@ -170,10 +172,10 @@ def insertion_sort(base, extended, errors): 
except UnicodeDecodeError as exc: raise UnicodeDecodeError( "punycode", - base.encode("utf-8", errors="backslashreplace") - + b"-" - + extended.encode("utf-8", errors="backslashreplace"), - pos + exc.start, pos + exc.end, exc.reason) + original_base.encode("utf-8", errors="backslashreplace") + + (b"-" if original_base else b"") + + original_ext.encode("utf-8", errors="backslashreplace"), + extended_offset+exc.start, extended_offset+exc.end, exc.reason) if delta is None: # There was an error in decoding. We can't continue because @@ -185,10 +187,11 @@ def insertion_sort(base, extended, errors): if errors == "strict": raise UnicodeDecodeError( "punycode", - base.encode("utf-8", errors="backslashreplace") - + b"-" - + extended.encode("utf-8", errors="backslashreplace"), - pos, pos+1, f"Invalid character U+{char:x}") + original_base.encode("utf-8", errors="backslashreplace") + + (b"-" if original_base else b"") + + original_ext.encode("utf-8", errors="backslashreplace"), + extended_offset+pos-1, extended_offset+pos, + f"Invalid character U+{char:x}") char = ord('?') pos = pos % (len(base) + 1) base = base[:pos] + chr(char) + base[pos:] diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 7f3d45580c40b6..d067ae4fe90b89 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1356,13 +1356,29 @@ def test_decode(self): def test_decode_invalid(self): testcases = [ - (b"xn--w&", "strict", UnicodeDecodeError("punycode", b"xn--w&", 0, 6, "")), + (b"xn--w&", "strict", UnicodeDecodeError("punycode", b"xn--W&", 5, 6, "")), + (b"&egbpdaj6bu4bxfgehfvwxn", "strict", UnicodeDecodeError("punycode", b"&EGBPDAJ6BU4BXFGEHFVWXN", 0, 1, "")), + (b"egbpdaj6bu&4bx&fgehfvwxn", "strict", UnicodeDecodeError("punycode", b"EGBPDAJ6BU&4BX&FGEHFVWXN", 10, 11, "")), + (b"egbpdaj6bu4bxfgehfvwxn&", "strict", UnicodeDecodeError("punycode", b"EGBPDAJ6BU4BXFGEHFVWXN&", 22, 23, "")), + (b"\xFFProprostnemluvesky-uyb24dma41a", "strict", UnicodeDecodeError("ascii", 
b"\xFFProprostnemluvesky", 0, 1, "")), + (b"Pro\xFFprostnemluvesky-uyb24dma41a", "strict", UnicodeDecodeError("ascii", b"Pro\xFFprostnemluvesky", 3, 4, "")), + (b"Proprost&nemluvesky-uyb24&dma41a", "strict", UnicodeDecodeError("punycode", b"Proprost&nemluvesky-UYB24&DMA41A", 25, 26, "")), + (b"Proprostnemluvesky&-&uyb24dma41a", "strict", UnicodeDecodeError("punycode", b"Proprostnemluvesky&-&UYB24DMA41A", 20, 21, "")), + (b"Proprostnemluvesky-&uyb24dma41a", "strict", UnicodeDecodeError("punycode", b"Proprostnemluvesky-&UYB24DMA41A", 19, 20, "")), + (b"Proprostnemluvesky-uyb24d&ma41a", "strict", UnicodeDecodeError("punycode", b"Proprostnemluvesky-UYB24D&MA41A", 25, 26, "")), + (b"Proprostnemluvesky-uyb24dma41a&", "strict", UnicodeDecodeError("punycode", b"Proprostnemluvesky-UYB24DMA41A&", 30, 31, "")), (b"xn--w&", "ignore", "xn-"), ] for puny, errors, expected in testcases: with self.subTest(puny=puny, errors=errors): if isinstance(expected, Exception): - self.assertRaises(UnicodeDecodeError, puny.decode, "punycode", errors) + with self.assertRaises(UnicodeDecodeError) as cm: + puny.decode("punycode", errors) + exc = cm.exception + self.assertEqual(exc.encoding, expected.encoding) + self.assertEqual(exc.object, expected.object) + self.assertEqual(exc.start, expected.start) + self.assertEqual(exc.end, expected.end) else: self.assertEqual(puny.decode("punycode", errors), expected) @@ -1542,18 +1558,57 @@ def test_nameprep(self): class IDNACodecTest(unittest.TestCase): + + invalid_decode_testcases = [ + (b"\xFFpython.org", UnicodeDecodeError("idna", b"\xFFpython.org", 0, 1, "")), + (b"pyt\xFFhon.org", UnicodeDecodeError("idna", b"pyt\xFFhon.org", 3, 4, "")), + (b"python\xFF.org", UnicodeDecodeError("idna", b"python\xFF.org", 6, 7, "")), + (b"python.\xFForg", UnicodeDecodeError("idna", b"python.\xFForg", 7, 8, "")), + (b"python.o\xFFrg", UnicodeDecodeError("idna", b"python.o\xFFrg", 8, 9, "")), + (b"python.org\xFF", UnicodeDecodeError("idna", b"python.org\xFF", 10, 11, 
"")), + (b"xn--pythn-&mua.org", UnicodeDecodeError("idna", b"xn--pythn-&mua.org", 10, 11, "")), + (b"xn--pythn-m&ua.org", UnicodeDecodeError("idna", b"xn--pythn-m&ua.org", 11, 12, "")), + (b"xn--pythn-mua&.org", UnicodeDecodeError("idna", b"xn--pythn-mua&.org", 13, 14, "")), + ] + invalid_encode_testcases = [ + (f"foo.{'\xff'*60}", UnicodeEncodeError("idna", f"foo.{'\xff'*60}", 4, 64, "")), + ("あさ.\u034f", UnicodeEncodeError("idna", "あさ.\u034f", 3, 4, "")), + ] + def test_builtin_decode(self): self.assertEqual(str(b"python.org", "idna"), "python.org") self.assertEqual(str(b"python.org.", "idna"), "python.org.") self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org") self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.") + def test_builtin_decode_invalid(self): + for case, expected in self.invalid_decode_testcases: + with self.subTest(case=case, expected=expected): + with self.assertRaises(UnicodeDecodeError) as cm: + case.decode("idna") + exc = cm.exception + self.assertEqual(exc.encoding, expected.encoding) + self.assertEqual(exc.object, expected.object) + self.assertEqual(exc.start, expected.start, msg=f'reason: {exc.reason}') + self.assertEqual(exc.end, expected.end) + def test_builtin_encode(self): self.assertEqual("python.org".encode("idna"), b"python.org") self.assertEqual("python.org.".encode("idna"), b"python.org.") self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org") self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.") + def test_builtin_encode_invalid(self): + for case, expected in self.invalid_encode_testcases: + with self.subTest(case=case, expected=expected): + with self.assertRaises(UnicodeEncodeError) as cm: + case.encode("idna") + exc = cm.exception + self.assertEqual(exc.encoding, expected.encoding) + self.assertEqual(exc.object, expected.object) + self.assertEqual(exc.start, expected.start) + self.assertEqual(exc.end, expected.end) + def test_builtin_decode_length_limit(self): with 
self.assertRaisesRegex(UnicodeDecodeError, "way too long"): (b"xn--016c"+b"a"*1100).decode("idna") @@ -1595,6 +1650,39 @@ def test_incremental_decode(self): self.assertEqual(decoder.decode(b"rg."), "org.") self.assertEqual(decoder.decode(b"", True), "") + def test_incremental_decode_invalid(self): + iterdecode_testcases = [ + (b"\xFFpython.org", UnicodeDecodeError("idna", b"\xFF", 0, 1, "")), + (b"pyt\xFFhon.org", UnicodeDecodeError("idna", b"pyt\xFF", 3, 4, "")), + (b"python\xFF.org", UnicodeDecodeError("idna", b"python\xFF", 6, 7, "")), + (b"python.\xFForg", UnicodeDecodeError("idna", b"\xFF", 0, 1, "")), + (b"python.o\xFFrg", UnicodeDecodeError("idna", b"o\xFF", 1, 2, "")), + (b"python.org\xFF", UnicodeDecodeError("idna", b"org\xFF", 3, 4, "")), + (b"xn--pythn-&mua.org", UnicodeDecodeError("idna", b"xn--pythn-&mua.", 10, 11, "")), + (b"xn--pythn-m&ua.org", UnicodeDecodeError("idna", b"xn--pythn-m&ua.", 11, 12, "")), + (b"xn--pythn-mua&.org", UnicodeDecodeError("idna", b"xn--pythn-mua&.", 13, 14, "")), + ] + for case, expected in iterdecode_testcases: + with self.subTest(case=case, expected=expected): + with self.assertRaises(UnicodeDecodeError) as cm: + list(codecs.iterdecode((bytes([c]) for c in case), "idna")) + exc = cm.exception + self.assertEqual(exc.encoding, expected.encoding) + self.assertEqual(exc.object, expected.object) + self.assertEqual(exc.start, expected.start) + self.assertEqual(exc.end, expected.end) + + decoder = codecs.getincrementaldecoder("idna")() + for case, expected in self.invalid_decode_testcases: + with self.subTest(case=case, expected=expected): + with self.assertRaises(UnicodeDecodeError) as cm: + decoder.decode(case) + exc = cm.exception + self.assertEqual(exc.encoding, expected.encoding) + self.assertEqual(exc.object, expected.object) + self.assertEqual(exc.start, expected.start) + self.assertEqual(exc.end, expected.end) + def test_incremental_encode(self): self.assertEqual( b"".join(codecs.iterencode("python.org", "idna")), @@ 
-1623,6 +1711,23 @@ def test_incremental_encode(self): self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.") self.assertEqual(encoder.encode("", True), b"") + def test_incremental_encode_invalid(self): + iterencode_testcases = [ + (f"foo.{'\xff'*60}", UnicodeEncodeError("idna", f"{'\xff'*60}", 0, 60, "")), + ("あさ.\u034f", UnicodeEncodeError("idna", "\u034f", 0, 1, "")), + ] + for case, expected in iterencode_testcases: + with self.subTest(case=case, expected=expected): + with self.assertRaises(UnicodeEncodeError) as cm: + list(codecs.iterencode(case, "idna")) + exc = cm.exception + self.assertEqual(exc.encoding, expected.encoding) + self.assertEqual(exc.object, expected.object) + self.assertEqual(exc.start, expected.start) + self.assertEqual(exc.end, expected.end) + + # codecs.getincrementalencoder.encode() does not throw an error + def test_errors(self): """Only supports "strict" error handler""" "python.org".encode("idna", "strict") From aefd7c228b65c5199b285eb47e3346ef88ffbcea Mon Sep 17 00:00:00 2001 From: John Sloboda Date: Fri, 16 Feb 2024 12:50:26 -0500 Subject: [PATCH 12/22] reduce scope of exception object, and fail gracefully if it cannot be created --- Modules/cjkcodecs/multibytecodec.c | 45 ++++++++++++++++-------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/Modules/cjkcodecs/multibytecodec.c b/Modules/cjkcodecs/multibytecodec.c index 98aca221d42c8c..1f311938fc130f 100644 --- a/Modules/cjkcodecs/multibytecodec.c +++ b/Modules/cjkcodecs/multibytecodec.c @@ -776,7 +776,6 @@ encoder_encode_stateful(MultibyteStatefulEncoderContext *ctx, PyObject *inbuf = NULL; Py_ssize_t inpos, datalen; PyObject *origpending = NULL; - PyObject *excobj = NULL; if (PyUnicode_Check(unistr)) ucvt = NULL; @@ -826,13 +825,15 @@ encoder_encode_stateful(MultibyteStatefulEncoderContext *ctx, if (inpos < datalen) { if (datalen - inpos > MAXENCPENDING) { /* normal codecs can't reach here */ - excobj = 
PyObject_CallFunction(PyExc_UnicodeEncodeError, - "ssnns", - ctx->codec->encoding, - PyUnicode_AsUTF8(inbuf), - inpos, datalen, - "pending buffer overflow"); + PyObject *excobj = PyObject_CallFunction(PyExc_UnicodeEncodeError, + "ssnns", + ctx->codec->encoding, + PyUnicode_AsUTF8(inbuf), + inpos, datalen, + "pending buffer overflow"); + if (excobj == NULL) goto errorexit; PyErr_SetObject(PyExc_UnicodeEncodeError, excobj); + Py_DECREF(excobj); goto errorexit; } ctx->pending = PyUnicode_Substring(inbuf, inpos, datalen); @@ -851,7 +852,6 @@ encoder_encode_stateful(MultibyteStatefulEncoderContext *ctx, Py_XDECREF(ucvt); Py_XDECREF(origpending); Py_XDECREF(inbuf); - Py_XDECREF(excobj); return NULL; } @@ -860,16 +860,20 @@ decoder_append_pending(MultibyteStatefulDecoderContext *ctx, MultibyteDecodeBuffer *buf) { Py_ssize_t npendings; - PyObject *excobj = NULL; npendings = (Py_ssize_t)(buf->inbuf_end - buf->inbuf); if (npendings + ctx->pendingsize > MAXDECPENDING || npendings > PY_SSIZE_T_MAX - ctx->pendingsize) { Py_ssize_t bufsize = (Py_ssize_t)(buf->inbuf_end - buf->inbuf_top); - excobj = PyUnicodeDecodeError_Create(ctx->codec->encoding, - (const char *)buf->inbuf_top, bufsize, - 0, bufsize, "pending buffer overflow"); + PyObject *excobj = PyUnicodeDecodeError_Create(ctx->codec->encoding, + (const char *)buf->inbuf_top, + bufsize, + 0, + bufsize, + "pending buffer overflow"); + if (excobj == NULL) goto errorexit; PyErr_SetObject(PyExc_UnicodeDecodeError, excobj); + Py_DECREF(excobj); goto errorexit; } memcpy(ctx->pending + ctx->pendingsize, buf->inbuf, npendings); @@ -877,7 +881,6 @@ decoder_append_pending(MultibyteStatefulDecoderContext *ctx, return 0; errorexit: - Py_XDECREF(excobj); return -1; } @@ -947,7 +950,6 @@ _multibytecodec_MultibyteIncrementalEncoder_getstate_impl(MultibyteIncrementalEn Py_ssize_t statesize; const char *pendingbuffer = NULL; Py_ssize_t pendingsize; - PyObject *excobj = NULL; if (self->pending != NULL) { pendingbuffer = 
PyUnicode_AsUTF8AndSize(self->pending, &pendingsize); @@ -955,13 +957,15 @@ _multibytecodec_MultibyteIncrementalEncoder_getstate_impl(MultibyteIncrementalEn goto errorexit; } if (pendingsize > MAXENCPENDING*4) { - excobj = PyObject_CallFunction(PyExc_UnicodeEncodeError, - "ssnns", - self->codec->encoding, - pendingbuffer, - 0, pendingsize, - "pending buffer too large"); + PyObject *excobj = PyObject_CallFunction(PyExc_UnicodeEncodeError, + "ssnns", + self->codec->encoding, + pendingbuffer, + 0, pendingsize, + "pending buffer too large"); + if (excobj == NULL) goto errorexit; PyErr_SetObject(PyExc_UnicodeEncodeError, excobj); + Py_DECREF(excobj); goto errorexit; } statebytes[0] = (unsigned char)pendingsize; @@ -979,7 +983,6 @@ _multibytecodec_MultibyteIncrementalEncoder_getstate_impl(MultibyteIncrementalEn 1 /* little-endian */ , 0 /* unsigned */ ); errorexit: - Py_XDECREF(excobj); return NULL; } From f73ccfe736e8d102e132d6e1d2b1b168b1bbbfc6 Mon Sep 17 00:00:00 2001 From: John Sloboda Date: Fri, 16 Feb 2024 13:01:51 -0500 Subject: [PATCH 13/22] use object formatting on inbuf directly in exc --- Modules/cjkcodecs/multibytecodec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Modules/cjkcodecs/multibytecodec.c b/Modules/cjkcodecs/multibytecodec.c index 1f311938fc130f..b75ae8b28777ad 100644 --- a/Modules/cjkcodecs/multibytecodec.c +++ b/Modules/cjkcodecs/multibytecodec.c @@ -826,9 +826,9 @@ encoder_encode_stateful(MultibyteStatefulEncoderContext *ctx, if (datalen - inpos > MAXENCPENDING) { /* normal codecs can't reach here */ PyObject *excobj = PyObject_CallFunction(PyExc_UnicodeEncodeError, - "ssnns", + "sOnns", ctx->codec->encoding, - PyUnicode_AsUTF8(inbuf), + inbuf, inpos, datalen, "pending buffer overflow"); if (excobj == NULL) goto errorexit; From e0747b4e77fd50b0996a0e938988c638e86906de Mon Sep 17 00:00:00 2001 From: John Sloboda Date: Fri, 16 Feb 2024 13:09:12 -0500 Subject: [PATCH 14/22] reduce scope of exception object, and fail 
gracefully if it cannot be created, but for pre-existing code --- Modules/cjkcodecs/multibytecodec.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/Modules/cjkcodecs/multibytecodec.c b/Modules/cjkcodecs/multibytecodec.c index b75ae8b28777ad..589b93a0bbe493 100644 --- a/Modules/cjkcodecs/multibytecodec.c +++ b/Modules/cjkcodecs/multibytecodec.c @@ -999,7 +999,6 @@ _multibytecodec_MultibyteIncrementalEncoder_setstate_impl(MultibyteIncrementalEn { PyObject *pending = NULL; unsigned char statebytes[1 + MAXENCPENDING*4 + sizeof(self->state.c)]; - PyObject *excobj = NULL; if (_PyLong_AsByteArray(statelong, statebytes, sizeof(statebytes), 1 /* little-endian */ , @@ -1009,13 +1008,15 @@ _multibytecodec_MultibyteIncrementalEncoder_setstate_impl(MultibyteIncrementalEn } if (statebytes[0] > MAXENCPENDING*4) { - excobj = PyObject_CallFunction(PyExc_UnicodeEncodeError, - "ssnns", - self->codec->encoding, - statebytes, - 0, sizeof(statebytes), - "pending buffer too large"); + PyObject *excobj = PyObject_CallFunction(PyExc_UnicodeEncodeError, + "ssnns", + self->codec->encoding, + statebytes, + 0, sizeof(statebytes), + "pending buffer too large"); + if (excobj == NULL) goto errorexit; PyErr_SetObject(PyExc_UnicodeEncodeError, excobj); + Py_DECREF(excobj); goto errorexit; } @@ -1033,7 +1034,6 @@ _multibytecodec_MultibyteIncrementalEncoder_setstate_impl(MultibyteIncrementalEn errorexit: Py_XDECREF(pending); - Py_XDECREF(excobj); return NULL; } @@ -1284,7 +1284,6 @@ _multibytecodec_MultibyteIncrementalDecoder_setstate_impl(MultibyteIncrementalDe Py_ssize_t buffersize; const char *bufferstr; unsigned char statebytes[8]; - PyObject *excobj = NULL; if (!PyArg_ParseTuple(state, "SO!;setstate(): illegal state argument", &buffer, &PyLong_Type, &statelong)) @@ -1305,12 +1304,13 @@ _multibytecodec_MultibyteIncrementalDecoder_setstate_impl(MultibyteIncrementalDe } if (buffersize > MAXDECPENDING) { - excobj = 
PyUnicodeDecodeError_Create(self->codec->encoding, - PyBytes_AS_STRING(buffer), buffersize, - 0, buffersize, - "pending buffer too large"); + PyObject *excobj = PyUnicodeDecodeError_Create(self->codec->encoding, + PyBytes_AS_STRING(buffer), buffersize, + 0, buffersize, + "pending buffer too large"); + if (excobj == NULL) return NULL; PyErr_SetObject(PyExc_UnicodeDecodeError, excobj); - Py_XDECREF(excobj); + Py_DECREF(excobj); return NULL; } From 93e99ae2e249002821e98a57f7b7df4e15ce75f8 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Wed, 21 Feb 2024 18:00:08 +0900 Subject: [PATCH 15/22] update MultibyteIncrementalEncoder.getstate() --- Modules/cjkcodecs/multibytecodec.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/Modules/cjkcodecs/multibytecodec.c b/Modules/cjkcodecs/multibytecodec.c index 589b93a0bbe493..96b21119286301 100644 --- a/Modules/cjkcodecs/multibytecodec.c +++ b/Modules/cjkcodecs/multibytecodec.c @@ -874,14 +874,11 @@ decoder_append_pending(MultibyteStatefulDecoderContext *ctx, if (excobj == NULL) goto errorexit; PyErr_SetObject(PyExc_UnicodeDecodeError, excobj); Py_DECREF(excobj); - goto errorexit; + return -1; } memcpy(ctx->pending + ctx->pendingsize, buf->inbuf, npendings); ctx->pendingsize += npendings; return 0; - -errorexit: - return -1; } static int @@ -954,19 +951,21 @@ _multibytecodec_MultibyteIncrementalEncoder_getstate_impl(MultibyteIncrementalEn if (self->pending != NULL) { pendingbuffer = PyUnicode_AsUTF8AndSize(self->pending, &pendingsize); if (pendingbuffer == NULL) { - goto errorexit; + return NULL; } if (pendingsize > MAXENCPENDING*4) { PyObject *excobj = PyObject_CallFunction(PyExc_UnicodeEncodeError, - "ssnns", + "sOnns", self->codec->encoding, - pendingbuffer, - 0, pendingsize, + self->pending, + 0, PyUnicode_GET_LENGTH(self->pending), "pending buffer too large"); - if (excobj == NULL) goto errorexit; + if (excobj == NULL) { + return NULL; + } PyErr_SetObject(PyExc_UnicodeEncodeError, 
excobj); Py_DECREF(excobj); - goto errorexit; + return NULL; } statebytes[0] = (unsigned char)pendingsize; memcpy(statebytes + 1, pendingbuffer, pendingsize); @@ -982,8 +981,6 @@ _multibytecodec_MultibyteIncrementalEncoder_getstate_impl(MultibyteIncrementalEn return (PyObject *)_PyLong_FromByteArray(statebytes, statesize, 1 /* little-endian */ , 0 /* unsigned */ ); -errorexit: - return NULL; } /*[clinic input] From 87e1f991f5fd0e945ec36fcf37ce0c7ee1646e4e Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Wed, 21 Feb 2024 18:12:11 +0900 Subject: [PATCH 16/22] fixup --- Modules/cjkcodecs/multibytecodec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/cjkcodecs/multibytecodec.c b/Modules/cjkcodecs/multibytecodec.c index 96b21119286301..ae7b389f297ea6 100644 --- a/Modules/cjkcodecs/multibytecodec.c +++ b/Modules/cjkcodecs/multibytecodec.c @@ -871,7 +871,7 @@ decoder_append_pending(MultibyteStatefulDecoderContext *ctx, 0, bufsize, "pending buffer overflow"); - if (excobj == NULL) goto errorexit; + if (excobj == NULL) return -1; PyErr_SetObject(PyExc_UnicodeDecodeError, excobj); Py_DECREF(excobj); return -1; From 0728a43a420f3dbf9b99633048ed7dfed0900902 Mon Sep 17 00:00:00 2001 From: John Sloboda Date: Wed, 21 Feb 2024 23:28:37 -0500 Subject: [PATCH 17/22] change buffer size issue error back to UnicodeError --- Modules/cjkcodecs/multibytecodec.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/Modules/cjkcodecs/multibytecodec.c b/Modules/cjkcodecs/multibytecodec.c index ae7b389f297ea6..6a1512f9f40fe5 100644 --- a/Modules/cjkcodecs/multibytecodec.c +++ b/Modules/cjkcodecs/multibytecodec.c @@ -1005,15 +1005,7 @@ _multibytecodec_MultibyteIncrementalEncoder_setstate_impl(MultibyteIncrementalEn } if (statebytes[0] > MAXENCPENDING*4) { - PyObject *excobj = PyObject_CallFunction(PyExc_UnicodeEncodeError, - "ssnns", - self->codec->encoding, - statebytes, - 0, sizeof(statebytes), - "pending buffer too large"); - if (excobj == 
NULL) goto errorexit; - PyErr_SetObject(PyExc_UnicodeEncodeError, excobj); - Py_DECREF(excobj); + PyErr_SetString(PyExc_UnicodeError, "pending buffer too large"); goto errorexit; } From 1cc911d20b926532ee8376d9824804cc36ccf55c Mon Sep 17 00:00:00 2001 From: John Sloboda Date: Thu, 22 Feb 2024 13:05:15 -0500 Subject: [PATCH 18/22] update test to match changed exception --- Lib/test/test_multibytecodec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_multibytecodec.py b/Lib/test/test_multibytecodec.py index f5d483f981a0fe..ccdf3a6cdc0dc7 100644 --- a/Lib/test/test_multibytecodec.py +++ b/Lib/test/test_multibytecodec.py @@ -193,7 +193,7 @@ def test_setstate_validates_input_size(self): b"\x00\x00\x00\x00\x00\x00\x00\x00" b"\x00\x00\x00\x00\x00\x00\x00\x00", 'little') - self.assertRaises(UnicodeEncodeError, encoder.setstate, pending_size_nine) + self.assertRaises(UnicodeError, encoder.setstate, pending_size_nine) def test_setstate_validates_input_bytes(self): encoder = codecs.getincrementalencoder('euc_jp')() From 9594baef5d4d0b4e55693699e75dc24142eb2a9f Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Fri, 23 Feb 2024 16:14:49 +0900 Subject: [PATCH 19/22] Update Modules/cjkcodecs/multibytecodec.c --- Modules/cjkcodecs/multibytecodec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/cjkcodecs/multibytecodec.c b/Modules/cjkcodecs/multibytecodec.c index 6a1512f9f40fe5..e5433d7dd85306 100644 --- a/Modules/cjkcodecs/multibytecodec.c +++ b/Modules/cjkcodecs/multibytecodec.c @@ -1006,7 +1006,7 @@ _multibytecodec_MultibyteIncrementalEncoder_setstate_impl(MultibyteIncrementalEn if (statebytes[0] > MAXENCPENDING*4) { PyErr_SetString(PyExc_UnicodeError, "pending buffer too large"); - goto errorexit; + return NULL; } pending = PyUnicode_DecodeUTF8((const char *)statebytes+1, From ea3ff8ac064ad02672b6180c4f83febe942a4773 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Fri, 23 Feb 2024 17:59:44 +0900 Subject: [PATCH 20/22] 
improve idna codec errors --- Lib/encodings/idna.py | 53 +++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py index 2ae5c7572c44f5..2907117fe49adb 100644 --- a/Lib/encodings/idna.py +++ b/Lib/encodings/idna.py @@ -11,7 +11,7 @@ sace_prefix = "xn--" # This assumes query strings, so AllowUnassigned is true -def nameprep(label): +def nameprep(label): # type: (str) -> str # Map newlabel = [] for c in label: @@ -63,18 +63,17 @@ def nameprep(label): return label -def ToASCII(label): +def ToASCII(label): # type: (str) -> bytes try: # Step 1: try ASCII - label = label.encode("ascii") + label_ascii = label.encode("ascii") except UnicodeEncodeError: pass else: # Skip to step 3: UseSTD3ASCIIRules is false, so # Skip to step 8. - if 0 < len(label) < 64: - return label - label = label.decode("ascii", errors="backslashreplace") + if 0 < len(label_ascii) < 64: + return label_ascii if len(label) == 0: raise UnicodeEncodeError("idna", label, 0, 1, "label empty") else: @@ -86,14 +85,13 @@ def ToASCII(label): # Step 3: UseSTD3ASCIIRules is false # Step 4: try ASCII try: - label = label.encode("ascii") + label_ascii = label.encode("ascii") except UnicodeEncodeError: pass else: # Skip to step 8. 
if 0 < len(label) < 64: - return label - label = label.decode("ascii", errors="backslashreplace") + return label_ascii if len(label) == 0: raise UnicodeEncodeError("idna", label, 0, 1, "label empty") else: @@ -102,23 +100,19 @@ def ToASCII(label): # Step 5: Check ACE prefix if label.startswith(sace_prefix): raise UnicodeEncodeError( - "idna", label.decode("ascii", errors="backslashreplace"), - 0, len(sace_prefix), "Label starts with ACE prefix") + "idna", label, 0, len(sace_prefix), "Label starts with ACE prefix") # Step 6: Encode with PUNYCODE - label = label.encode("punycode") + label_ascii = label.encode("punycode") # Step 7: Prepend ACE prefix - label = ace_prefix + label + label_ascii = ace_prefix + label_ascii # Step 8: Check size - if 0 < len(label) < 64: - return label - label = label[len(ace_prefix):].decode("punycode", errors="replace") - if len(label) == 0: - raise UnicodeEncodeError("idna", label, 0, 1, "label empty") - else: - raise UnicodeEncodeError("idna", label, 0, len(label), "label too long") + # do not check for empty as we prepend ace_prefix. + if len(label_ascii) < 64: + return label_ascii + raise UnicodeEncodeError("idna", label, 0, len(label), "label too long") def ToUnicode(label): if len(label) > 1024: @@ -130,9 +124,9 @@ def ToUnicode(label): # per https://www.rfc-editor.org/rfc/rfc3454#section-3.1 while still # preventing us from wasting time decoding a big thing that'll just # hit the actual <= 63 length limit in Step 6. 
- if isinstance(label, bytes): - label = label.decode("utf-8", errors="backslashreplace") - raise UnicodeEncodeError("idna", label, 0, len(label), "label way too long") + if isinstance(label, str): + label = label.encode("utf-8", errors="backslashreplace") + raise UnicodeDecodeError("idna", label, 0, len(label), "label way too long") # Step 1: Check for ASCII if isinstance(label, bytes): pure_ascii = True @@ -143,17 +137,17 @@ def ToUnicode(label): except UnicodeEncodeError: pure_ascii = False if not pure_ascii: + assert isinstance(label, str) # Step 2: Perform nameprep label = nameprep(label) # It doesn't say this, but apparently, it should be ASCII now try: label = label.encode("ascii") - except (UnicodeEncodeError, UnicodeDecodeError) as exc: - if isinstance(label, bytes): - label = label.decode("utf-8", errors="backslashreplace") + except UnicodeEncodeError as exc: raise UnicodeEncodeError("idna", label, exc.start, exc.end, "Invalid character in IDN label") # Step 3: Check for ACE prefix + assert isinstance(label, bytes) if not label.startswith(ace_prefix): return str(label, "ascii") @@ -163,10 +157,9 @@ def ToUnicode(label): # Step 5: Decode using PUNYCODE try: result = label1.decode("punycode") - except (UnicodeEncodeError, UnicodeDecodeError) as exc: + except UnicodeDecodeError as exc: offset = len(ace_prefix) - raise UnicodeEncodeError("idna", label.decode("utf-8", errors="backslashreplace"), - offset+exc.start, offset+exc.end, exc.reason) + raise UnicodeDecodeError("idna", label, offset+exc.start, offset+exc.end, exc.reason) # Step 6: Apply ToASCII label2 = ToASCII(result) @@ -174,7 +167,7 @@ def ToUnicode(label): # Step 7: Compare the result of step 6 with the one of step 3 # label2 will already be in lower case. 
if str(label, "ascii").lower() != str(label2, "ascii"): - raise UnicodeEncodeError("idna", label, 0, len(label), + raise UnicodeDecodeError("idna", label, 0, len(label), f"IDNA does not round-trip, '{label!r}' != '{label2!r}'") # Step 8: return the result of step 5 From 8a2bc500423fd2961b684703aaecd901ca734ec5 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Fri, 23 Feb 2024 18:39:45 +0900 Subject: [PATCH 21/22] improve punycode.decode() --- Lib/encodings/punycode.py | 41 +++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/Lib/encodings/punycode.py b/Lib/encodings/punycode.py index 62d53f5df9f3bf..d6242e6e9e639e 100644 --- a/Lib/encodings/punycode.py +++ b/Lib/encodings/punycode.py @@ -131,11 +131,10 @@ def decode_generalized_number(extended, extpos, bias, errors): j = 0 while 1: try: - char = ord(extended[extpos]) + char = extended[extpos] except IndexError: if errors == "strict": - b_extended = extended.encode("utf-8", errors="backslashreplace") - raise UnicodeDecodeError("punycode", b_extended, extpos, extpos+1, + raise UnicodeDecodeError("punycode", extended, extpos, extpos+1, "incomplete punycode string") return extpos + 1, None extpos += 1 @@ -144,8 +143,7 @@ def decode_generalized_number(extended, extpos, bias, errors): elif 0x30 <= char <= 0x39: digit = char - 22 # 0x30-26 elif errors == "strict": - b_extended = extended.encode("utf-8", errors="backslashreplace") - raise UnicodeDecodeError("punycode", b_extended, extpos-1, extpos, + raise UnicodeDecodeError("punycode", extended, extpos-1, extpos, f"Invalid extended code point '{extended[extpos-1]}'") else: return extpos, None @@ -163,8 +161,9 @@ def insertion_sort(base, extended, errors): pos = -1 bias = 72 extpos = 0 - original_base, original_ext = base, extended - extended_offset = (len(original_base) + 1) if original_base else 0 + extended_offset = (len(base) + 1) if base else 0 + result = base.decode('ascii', errors) + while extpos < len(extended): try: 
newpos, delta = decode_generalized_number(extended, extpos, @@ -172,32 +171,28 @@ def insertion_sort(base, extended, errors): except UnicodeDecodeError as exc: raise UnicodeDecodeError( "punycode", - original_base.encode("utf-8", errors="backslashreplace") - + (b"-" if original_base else b"") - + original_ext.encode("utf-8", errors="backslashreplace"), + base + (b"-" if base else b"") + extended, extended_offset+exc.start, extended_offset+exc.end, exc.reason) if delta is None: # There was an error in decoding. We can't continue because # synchronization is lost. - return base + return result pos += delta+1 - char += pos // (len(base) + 1) + char += pos // (len(result) + 1) if char > 0x10FFFF: if errors == "strict": raise UnicodeDecodeError( "punycode", - original_base.encode("utf-8", errors="backslashreplace") - + (b"-" if original_base else b"") - + original_ext.encode("utf-8", errors="backslashreplace"), + base + (b"-" if base else b"") + extended, extended_offset+pos-1, extended_offset+pos, f"Invalid character U+{char:x}") char = ord('?') - pos = pos % (len(base) + 1) - base = base[:pos] + chr(char) + base[pos:] - bias = adapt(delta, (extpos == 0), len(base)) + pos = pos % (len(result) + 1) + result = result[:pos] + chr(char) + result[pos:] + bias = adapt(delta, (extpos == 0), len(result)) extpos = newpos - return base + return result def punycode_decode(text, errors): if isinstance(text, str): @@ -206,11 +201,11 @@ def punycode_decode(text, errors): text = bytes(text) pos = text.rfind(b"-") if pos == -1: - base = "" - extended = str(text, "ascii").upper() + base = b"" + extended = text.upper() else: - base = str(text[:pos], "ascii", errors) - extended = str(text[pos+1:], "ascii").upper() + base = text[:pos] + extended = text[pos+1:].upper() return insertion_sort(base, extended, errors) ### Codec APIs From a63e17a751e09ea3589a75c4e15ace16fe2ae4b8 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Fri, 23 Feb 2024 19:21:35 +0900 Subject: [PATCH 22/22] improve 
punycode_decode again --- Lib/encodings/punycode.py | 47 ++++++++++++++++++++------------------- Lib/test/test_codecs.py | 24 ++++++++++---------- 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/Lib/encodings/punycode.py b/Lib/encodings/punycode.py index d6242e6e9e639e..4622fc8c9206f3 100644 --- a/Lib/encodings/punycode.py +++ b/Lib/encodings/punycode.py @@ -157,42 +157,33 @@ def decode_generalized_number(extended, extpos, bias, errors): def insertion_sort(base, extended, errors): """3.2 Insertion sort coding""" + # This function raises UnicodeDecodeError with position in the extended. + # Caller should add the offset. char = 0x80 pos = -1 bias = 72 extpos = 0 - extended_offset = (len(base) + 1) if base else 0 - result = base.decode('ascii', errors) while extpos < len(extended): - try: - newpos, delta = decode_generalized_number(extended, extpos, - bias, errors) - except UnicodeDecodeError as exc: - raise UnicodeDecodeError( - "punycode", - base + (b"-" if base else b"") + extended, - extended_offset+exc.start, extended_offset+exc.end, exc.reason) - + newpos, delta = decode_generalized_number(extended, extpos, + bias, errors) if delta is None: # There was an error in decoding. We can't continue because # synchronization is lost. 
- return result + return base pos += delta+1 - char += pos // (len(result) + 1) + char += pos // (len(base) + 1) if char > 0x10FFFF: if errors == "strict": raise UnicodeDecodeError( - "punycode", - base + (b"-" if base else b"") + extended, - extended_offset+pos-1, extended_offset+pos, + "punycode", extended, pos-1, pos, f"Invalid character U+{char:x}") char = ord('?') - pos = pos % (len(result) + 1) - result = result[:pos] + chr(char) + result[pos:] - bias = adapt(delta, (extpos == 0), len(result)) + pos = pos % (len(base) + 1) + base = base[:pos] + chr(char) + base[pos:] + bias = adapt(delta, (extpos == 0), len(base)) extpos = newpos - return result + return base def punycode_decode(text, errors): if isinstance(text, str): @@ -201,12 +192,22 @@ def punycode_decode(text, errors): text = bytes(text) pos = text.rfind(b"-") if pos == -1: - base = b"" + base = "" extended = text.upper() else: - base = text[:pos] + try: + base = str(text[:pos], "ascii", errors) + except UnicodeDecodeError as exc: + raise UnicodeDecodeError("ascii", text, exc.start, exc.end, + exc.reason) from None extended = text[pos+1:].upper() - return insertion_sort(base, extended, errors) + try: + return insertion_sort(base, extended, errors) + except UnicodeDecodeError as exc: + offset = pos + 1 + raise UnicodeDecodeError("punycode", text, + offset+exc.start, offset+exc.end, + exc.reason) from None ### Codec APIs diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index d067ae4fe90b89..008092830d251d 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1356,17 +1356,17 @@ def test_decode(self): def test_decode_invalid(self): testcases = [ - (b"xn--w&", "strict", UnicodeDecodeError("punycode", b"xn--W&", 5, 6, "")), - (b"&egbpdaj6bu4bxfgehfvwxn", "strict", UnicodeDecodeError("punycode", b"&EGBPDAJ6BU4BXFGEHFVWXN", 0, 1, "")), - (b"egbpdaj6bu&4bx&fgehfvwxn", "strict", UnicodeDecodeError("punycode", b"EGBPDAJ6BU&4BX&FGEHFVWXN", 10, 11, "")), - (b"egbpdaj6bu4bxfgehfvwxn&", 
"strict", UnicodeDecodeError("punycode", b"EGBPDAJ6BU4BXFGEHFVWXN&", 22, 23, "")), - (b"\xFFProprostnemluvesky-uyb24dma41a", "strict", UnicodeDecodeError("ascii", b"\xFFProprostnemluvesky", 0, 1, "")), - (b"Pro\xFFprostnemluvesky-uyb24dma41a", "strict", UnicodeDecodeError("ascii", b"Pro\xFFprostnemluvesky", 3, 4, "")), - (b"Proprost&nemluvesky-uyb24&dma41a", "strict", UnicodeDecodeError("punycode", b"Proprost&nemluvesky-UYB24&DMA41A", 25, 26, "")), - (b"Proprostnemluvesky&-&uyb24dma41a", "strict", UnicodeDecodeError("punycode", b"Proprostnemluvesky&-&UYB24DMA41A", 20, 21, "")), - (b"Proprostnemluvesky-&uyb24dma41a", "strict", UnicodeDecodeError("punycode", b"Proprostnemluvesky-&UYB24DMA41A", 19, 20, "")), - (b"Proprostnemluvesky-uyb24d&ma41a", "strict", UnicodeDecodeError("punycode", b"Proprostnemluvesky-UYB24D&MA41A", 25, 26, "")), - (b"Proprostnemluvesky-uyb24dma41a&", "strict", UnicodeDecodeError("punycode", b"Proprostnemluvesky-UYB24DMA41A&", 30, 31, "")), + (b"xn--w&", "strict", UnicodeDecodeError("punycode", b"", 5, 6, "")), + (b"&egbpdaj6bu4bxfgehfvwxn", "strict", UnicodeDecodeError("punycode", b"", 0, 1, "")), + (b"egbpdaj6bu&4bx&fgehfvwxn", "strict", UnicodeDecodeError("punycode", b"", 10, 11, "")), + (b"egbpdaj6bu4bxfgehfvwxn&", "strict", UnicodeDecodeError("punycode", b"", 22, 23, "")), + (b"\xFFProprostnemluvesky-uyb24dma41a", "strict", UnicodeDecodeError("ascii", b"", 0, 1, "")), + (b"Pro\xFFprostnemluvesky-uyb24dma41a", "strict", UnicodeDecodeError("ascii", b"", 3, 4, "")), + (b"Proprost&nemluvesky-uyb24&dma41a", "strict", UnicodeDecodeError("punycode", b"", 25, 26, "")), + (b"Proprostnemluvesky&-&uyb24dma41a", "strict", UnicodeDecodeError("punycode", b"", 20, 21, "")), + (b"Proprostnemluvesky-&uyb24dma41a", "strict", UnicodeDecodeError("punycode", b"", 19, 20, "")), + (b"Proprostnemluvesky-uyb24d&ma41a", "strict", UnicodeDecodeError("punycode", b"", 25, 26, "")), + (b"Proprostnemluvesky-uyb24dma41a&", "strict", UnicodeDecodeError("punycode", b"", 30, 
31, "")), (b"xn--w&", "ignore", "xn-"), ] for puny, errors, expected in testcases: @@ -1376,7 +1376,7 @@ def test_decode_invalid(self): puny.decode("punycode", errors) exc = cm.exception self.assertEqual(exc.encoding, expected.encoding) - self.assertEqual(exc.object, expected.object) + self.assertEqual(exc.object, puny) self.assertEqual(exc.start, expected.start) self.assertEqual(exc.end, expected.end) else: