From 6816eb3af3573bea081e2d18591ddf85a1cf9a85 Mon Sep 17 00:00:00 2001 From: Srinivas Reddy Thatiparthy Date: Thu, 25 Jun 2020 22:10:25 +0530 Subject: [PATCH 1/4] bpo-41115: Convert UnicodeError to UnicodeEncodeError| UnicodeDecodeError in idna.py, utf_16.py, utf_32.py, punycode.py, undefined.py modules. --- Lib/encodings/idna.py | 22 +++++++++---------- Lib/encodings/punycode.py | 21 ++++++++++-------- Lib/encodings/undefined.py | 22 +++++++++---------- Lib/encodings/utf_16.py | 6 ++--- Lib/encodings/utf_32.py | 4 ++-- Lib/test/test_codecs.py | 4 ++-- .../2020-06-26-06-03-42.bpo-41115.NTjUWO.rst | 2 ++ 7 files changed, 43 insertions(+), 38 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2020-06-26-06-03-42.bpo-41115.NTjUWO.rst diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py index ea4058512fe366..962afa82309f19 100644 --- a/Lib/encodings/idna.py +++ b/Lib/encodings/idna.py @@ -63,14 +63,14 @@ def ToASCII(label): try: # Step 1: try ASCII label = label.encode("ascii") - except UnicodeError: + except UnicodeEncodeError: pass else: # Skip to step 3: UseSTD3ASCIIRules is false, so # Skip to step 8. if 0 < len(label) < 64: return label - raise UnicodeError("label empty or too long") + raise UnicodeEncodeError("ascii", label, 0, len(label), "label empty or too long") # Step 2: nameprep label = nameprep(label) @@ -79,17 +79,17 @@ def ToASCII(label): # Step 4: try ASCII try: label = label.encode("ascii") - except UnicodeError: + except UnicodeEncodeError: pass else: # Skip to step 8. if 0 < len(label) < 64: return label - raise UnicodeError("label empty or too long") + raise UnicodeEncodeError("ascii", label, 0, len(label), "label empty or too long") # Step 5: Check ACE prefix if label.startswith(sace_prefix): - raise UnicodeError("Label starts with ACE prefix") + raise UnicodeEncodeError("ascii", label, 0, len(label), "Label starts with ACE prefix") # Step 6: Encode with PUNYCODE label = label.encode("punycode") @@ -100,7 +100,7 @@ def ToASCII(label): # Step 8: Check size if 0 < len(label) < 64: return label - raise UnicodeError("label empty or too long") + raise UnicodeEncodeError("punycode", label, 0, len(label), "label empty or too long") def ToUnicode(label): # Step 1: Check for ASCII @@ -110,7 +110,7 @@ def ToUnicode(label): try: label = label.encode("ascii") pure_ascii = True - except UnicodeError: + except UnicodeEncodeError: pure_ascii = False if not pure_ascii: # Step 2: Perform nameprep @@ -118,8 +118,8 @@ def ToUnicode(label): # It doesn't say this, but apparently, it should be ASCII now try: label = label.encode("ascii") - except UnicodeError: - raise UnicodeError("Invalid character in IDN label") + except UnicodeEncodeError: + raise UnicodeEncodeError("ascii", label, 0, len(label), "Invalid character in IDN label") # Step 3: Check for ACE prefix if not label.startswith(ace_prefix): return str(label, "ascii") @@ -162,9 +162,9 @@ def encode(self, input, errors='strict'): labels = result.split(b'.') for label in labels[:-1]: if not (0 < len(label) < 64): - raise UnicodeError("label empty or too long") + raise UnicodeEncodeError("ascii", label, 0, len(label), "label empty or too long") if len(labels[-1]) >= 64: - raise UnicodeError("label too long") + raise UnicodeEncodeError("ascii", labels[-1], 0, len(labels[-1]), "label too long") return result, len(input) result = bytearray() diff --git a/Lib/encodings/punycode.py b/Lib/encodings/punycode.py index 1c5726447077b1..0054f47efba8a5 100644 --- a/Lib/encodings/punycode.py +++ b/Lib/encodings/punycode.py @@ -1,4 +1,4 @@ -""" Codec for the Punicode encoding, as specified in RFC 3492 +""" Codec for the Punycode encoding, as specified in RFC 3492 Written by Martin v. Löwis. """ @@ -74,7 +74,9 @@ def T(j, bias): if res > 26: return 26 return res + digits = b"abcdefghijklmnopqrstuvwxyz0123456789" + def generate_generalized_integer(N, bias): """3.3 Generalized variable-length integers""" result = bytearray() @@ -111,7 +113,7 @@ def generate_integers(baselen, deltas): for points, delta in enumerate(deltas): s = generate_generalized_integer(delta, bias) result.extend(s) - bias = adapt(delta, points==0, baselen+points+1) + bias = adapt(delta, points == 0, baselen+points+1) return bytes(result) def punycode_encode(text): @@ -134,7 +136,8 @@ def decode_generalized_number(extended, extpos, bias, errors): char = ord(extended[extpos]) except IndexError: if errors == "strict": - raise UnicodeError("incomplete punicode string") + raise UnicodeDecodeError("punycode", bytes(extended[extpos], "utf-8"), extpos, extpos+1, + "incomplete punycode string") return extpos + 1, None extpos += 1 if 0x41 <= char <= 0x5A: # A-Z @@ -142,8 +145,8 @@ def decode_generalized_number(extended, extpos, bias, errors): elif 0x30 <= char <= 0x39: digit = char - 22 # 0x30-26 elif errors == "strict": - raise UnicodeError("Invalid extended code point '%s'" - % extended[extpos-1]) + raise UnicodeDecodeError("punycode", bytes(extended[extpos-1], "utf-8"), extpos-1, extpos, + "Invalid extended code point '%s'" % extended[extpos-1]) else: return extpos, None t = T(j, bias) @@ -171,7 +174,7 @@ def insertion_sort(base, extended, errors): char += pos // (len(base) + 1) if char > 0x10FFFF: if errors == "strict": - raise UnicodeError("Invalid character U+%x" % char) + raise UnicodeDecodeError("punycode", bytes(char, "utf-8"), 0, len(char), "Invalid character U+%x" % char) char = ord('?') pos = pos % (len(base) + 1) base = base[:pos] + chr(char) + base[pos:] @@ -217,13 +220,13 @@ def decode(self, input, final=False): raise UnicodeError("Unsupported error handling "+self.errors) return punycode_decode(input, self.errors) -class StreamWriter(Codec,codecs.StreamWriter): +class StreamWriter(Codec, codecs.StreamWriter): pass -class StreamReader(Codec,codecs.StreamReader): +class StreamReader(Codec, codecs.StreamReader): pass -### encodings module API +# encodings module API def getregentry(): return codecs.CodecInfo( diff --git a/Lib/encodings/undefined.py b/Lib/encodings/undefined.py index 4690288355c710..269a06c8160fe9 100644 --- a/Lib/encodings/undefined.py +++ b/Lib/encodings/undefined.py @@ -1,8 +1,8 @@ """ Python 'undefined' Codec - This codec will always raise a ValueError exception when being - used. It is intended for use by the site.py file to switch off - automatic string to Unicode coercion. + This codec will always raise a UnicodeEncodeError | UnicodeDecodeError + exception when being used. It is intended for use by the site.py file + to switch off automatic string to Unicode coercion. Written by Marc-Andre Lemburg (mal@lemburg.com). @@ -15,24 +15,24 @@ class Codec(codecs.Codec): - def encode(self,input,errors='strict'): - raise UnicodeError("undefined encoding") + def encode(self, input, errors='strict'): + raise UnicodeEncodeError("undefined", str(input), 0, len(input), "undefined encoding") - def decode(self,input,errors='strict'): - raise UnicodeError("undefined encoding") + def decode(self, input, errors='strict'): + raise UnicodeDecodeError("undefined", bytes(input), 0, len(input), "undefined decoding") class IncrementalEncoder(codecs.IncrementalEncoder): def encode(self, input, final=False): - raise UnicodeError("undefined encoding") + raise UnicodeEncodeError("undefined", str(input), 0, len(input), "undefined encoding") class IncrementalDecoder(codecs.IncrementalDecoder): def decode(self, input, final=False): - raise UnicodeError("undefined encoding") + raise UnicodeDecodeError("undefined", bytes(input), 0, len(input), "undefined decoding") -class StreamWriter(Codec,codecs.StreamWriter): +class StreamWriter(Codec, codecs.StreamWriter): pass -class StreamReader(Codec,codecs.StreamReader): +class StreamReader(Codec, codecs.StreamReader): pass ### encodings module API diff --git a/Lib/encodings/utf_16.py b/Lib/encodings/utf_16.py index c61248242be8c7..4e641bd03aef38 100644 --- a/Lib/encodings/utf_16.py +++ b/Lib/encodings/utf_16.py @@ -64,7 +64,7 @@ def _buffer_decode(self, input, errors, final): elif byteorder == 1: self.decoder = codecs.utf_16_be_decode elif consumed >= 2: - raise UnicodeError("UTF-16 stream does not start with BOM") + raise UnicodeDecodeError("utc-16", input, 0, 0, "UTF-16 stream does not start with BOM") return (output, consumed) return self.decoder(input, self.errors, final) @@ -137,8 +137,8 @@ def decode(self, input, errors='strict'): self.decode = codecs.utf_16_le_decode elif byteorder == 1: self.decode = codecs.utf_16_be_decode - elif consumed>=2: - raise UnicodeError("UTF-16 stream does not start with BOM") + elif consumed >= 2: + raise UnicodeDecodeError("utf-16", input, 0, 0, "UTF-16 stream does not start with BOM") return (object, consumed) ### encodings module API diff --git a/Lib/encodings/utf_32.py b/Lib/encodings/utf_32.py index cdf84d14129a62..c4c1e2ccfa5279 100644 --- a/Lib/encodings/utf_32.py +++ b/Lib/encodings/utf_32.py @@ -59,7 +59,7 @@ def _buffer_decode(self, input, errors, final): elif byteorder == 1: self.decoder = codecs.utf_32_be_decode elif consumed >= 4: - raise UnicodeError("UTF-32 stream does not start with BOM") + raise UnicodeDecodeError("utf-32", input, 0, 0, "UTF-32 stream does not start with BOM") return (output, consumed) return self.decoder(input, self.errors, final) @@ -133,7 +133,7 @@ def decode(self, input, errors='strict'): elif byteorder == 1: self.decode = codecs.utf_32_be_decode elif consumed>=4: - raise UnicodeError("UTF-32 stream does not start with BOM") + raise UnicodeDecodeError("utf-32", input, 0, 0, "UTF-32 stream does not start with BOM") return (object, consumed) ### encodings module API diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 54a3520802a4f3..9f856dcee3598d 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1334,13 +1334,13 @@ def test_decode(self): def test_decode_invalid(self): testcases = [ - (b"xn--w&", "strict", UnicodeError()), + (b"xn--w&", "strict", UnicodeDecodeError("punycode", b"xn--w&", 0, 0, "")), (b"xn--w&", "ignore", "xn-"), ] for puny, errors, expected in testcases: with self.subTest(puny=puny, errors=errors): if isinstance(expected, Exception): - self.assertRaises(UnicodeError, puny.decode, "punycode", errors) + self.assertRaises(UnicodeDecodeError, puny.decode, "punycode", errors) else: self.assertEqual(puny.decode("punycode", errors), expected) diff --git a/Misc/NEWS.d/next/Library/2020-06-26-06-03-42.bpo-41115.NTjUWO.rst b/Misc/NEWS.d/next/Library/2020-06-26-06-03-42.bpo-41115.NTjUWO.rst new file mode 100644 index 00000000000000..bf2203e0a1ba4e --- /dev/null +++ b/Misc/NEWS.d/next/Library/2020-06-26-06-03-42.bpo-41115.NTjUWO.rst @@ -0,0 +1,2 @@ +Convert :exc: UnicodeError to :exc: UnicodeEncodeError or :exc: UnicodeDecodeError where appropriate. +Patch by Srinivas Reddy Thatiparthy \ No newline at end of file From 0d2420741eb75f062b869a4dc289f263257045a8 Mon Sep 17 00:00:00 2001 From: Srinivas Reddy Thatiparthy Date: Fri, 26 Jun 2020 11:57:01 +0530 Subject: [PATCH 2/4] bpo-41115: Convert label to str(label) --- Lib/encodings/idna.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py index 962afa82309f19..14485a269a8093 100644 --- a/Lib/encodings/idna.py +++ b/Lib/encodings/idna.py @@ -70,7 +70,7 @@ def ToASCII(label): # Skip to step 8. if 0 < len(label) < 64: return label - raise UnicodeEncodeError("ascii", label, 0, len(label), "label empty or too long") + raise UnicodeEncodeError("ascii", str(label), 0, len(label), "label empty or too long") # Step 2: nameprep label = nameprep(label) @@ -85,11 +85,11 @@ def ToASCII(label): # Skip to step 8. if 0 < len(label) < 64: return label - raise UnicodeEncodeError("ascii", label, 0, len(label), "label empty or too long") + raise UnicodeEncodeError("ascii", str(label), 0, len(label), "label empty or too long") # Step 5: Check ACE prefix if label.startswith(sace_prefix): - raise UnicodeEncodeError("ascii", label, 0, len(label), "Label starts with ACE prefix") + raise UnicodeEncodeError("ascii", str(label), 0, len(label), "Label starts with ACE prefix") # Step 6: Encode with PUNYCODE label = label.encode("punycode") @@ -100,7 +100,7 @@ def ToASCII(label): # Step 8: Check size if 0 < len(label) < 64: return label - raise UnicodeEncodeError("punycode", label, 0, len(label), "label empty or too long") + raise UnicodeEncodeError("punycode", str(label), 0, len(label), "label empty or too long") def ToUnicode(label): # Step 1: Check for ASCII @@ -162,9 +162,9 @@ def encode(self, input, errors='strict'): labels = result.split(b'.') for label in labels[:-1]: if not (0 < len(label) < 64): - raise UnicodeEncodeError("ascii", label, 0, len(label), "label empty or too long") + raise UnicodeEncodeError("ascii", str(label), 0, len(label), "label empty or too long") if len(labels[-1]) >= 64: - raise UnicodeEncodeError("ascii", labels[-1], 0, len(labels[-1]), "label too long") + raise UnicodeEncodeError("ascii", str(labels[-1]), 0, len(labels[-1]), "label too long") return result, len(input) result = bytearray() From dd44d595e98c806e176176e5c58d0ced38153d38 Mon Sep 17 00:00:00 2001 From: Srinivas Reddy Thatiparthy Date: Fri, 26 Jun 2020 12:15:53 +0530 Subject: [PATCH 3/4] bpo-41115: Fix build failures in idna.py --- Lib/encodings/idna.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py index 14485a269a8093..bf20a3974c54b3 100644 --- a/Lib/encodings/idna.py +++ b/Lib/encodings/idna.py @@ -162,9 +162,11 @@ def encode(self, input, errors='strict'): labels = result.split(b'.') for label in labels[:-1]: if not (0 < len(label) < 64): - raise UnicodeEncodeError("ascii", str(label), 0, len(label), "label empty or too long") + raise UnicodeEncodeError("ascii", label.decode('ascii'), 0, len(label.decode('ascii')), + "label empty or too long") if len(labels[-1]) >= 64: - raise UnicodeEncodeError("ascii", str(labels[-1]), 0, len(labels[-1]), "label too long") + raise UnicodeEncodeError("ascii", labels[-1].decode('ascii'), 0, len(labels[-1].decode('ascii')), + "label too long") return result, len(input) result = bytearray() From 64778d2d025344a1dfba5cb039f81dae0be34577 Mon Sep 17 00:00:00 2001 From: Srinivas Reddy Thatiparthy Date: Fri, 26 Jun 2020 12:39:06 +0530 Subject: [PATCH 4/4] bpo-41115:Fix doc failure --- Lib/encodings/idna.py | 14 +++++++++----- .../2020-06-26-06-03-42.bpo-41115.NTjUWO.rst | 2 +- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py index bf20a3974c54b3..dc70838069431c 100644 --- a/Lib/encodings/idna.py +++ b/Lib/encodings/idna.py @@ -70,7 +70,8 @@ def ToASCII(label): # Skip to step 8. if 0 < len(label) < 64: return label - raise UnicodeEncodeError("ascii", str(label), 0, len(label), "label empty or too long") + raise UnicodeEncodeError("ascii", label.decode("ascii"), 0, len(label.decode("ascii")), + "label empty or too long") # Step 2: nameprep label = nameprep(label) @@ -85,7 +86,8 @@ def ToASCII(label): # Skip to step 8. if 0 < len(label) < 64: return label - raise UnicodeEncodeError("ascii", str(label), 0, len(label), "label empty or too long") + raise UnicodeEncodeError("ascii", label.decode("ascii"), 0, len(label.decode("ascii")), + "label empty or too long") # Step 5: Check ACE prefix if label.startswith(sace_prefix): @@ -98,9 +100,10 @@ def ToASCII(label): label = ace_prefix + label # Step 8: Check size - if 0 < len(label) < 64: + if len(label) < 64: return label - raise UnicodeEncodeError("punycode", str(label), 0, len(label), "label empty or too long") + raise UnicodeEncodeError("punycode", label.decode("punycode"), 0, + len(label.decode("punycode")), "label too long") def ToUnicode(label): # Step 1: Check for ASCII @@ -119,7 +122,8 @@ def ToUnicode(label): try: label = label.encode("ascii") except UnicodeEncodeError: - raise UnicodeEncodeError("ascii", label, 0, len(label), "Invalid character in IDN label") + raise UnicodeEncodeError("ascii", label.decode("ascii"), 0, len(label.decode("ascii")), + "Invalid character in IDN label") # Step 3: Check for ACE prefix if not label.startswith(ace_prefix): return str(label, "ascii") diff --git a/Misc/NEWS.d/next/Library/2020-06-26-06-03-42.bpo-41115.NTjUWO.rst b/Misc/NEWS.d/next/Library/2020-06-26-06-03-42.bpo-41115.NTjUWO.rst index bf2203e0a1ba4e..9827897d709b68 100644 --- a/Misc/NEWS.d/next/Library/2020-06-26-06-03-42.bpo-41115.NTjUWO.rst +++ b/Misc/NEWS.d/next/Library/2020-06-26-06-03-42.bpo-41115.NTjUWO.rst @@ -1,2 +1,2 @@ -Convert :exc: UnicodeError to :exc: UnicodeEncodeError or :exc: UnicodeDecodeError where appropriate. +Convert :exc:`UnicodeError` to :exc:`UnicodeEncodeError` or :exc:`UnicodeDecodeError` where appropriate. Patch by Srinivas Reddy Thatiparthy \ No newline at end of file