From 365a6cb2e43b51a2a0fa80b8b21f489f3a397532 Mon Sep 17 00:00:00 2001
From: "Gregory P. Smith [Google]" <greg@krypto.org>
Date: Fri, 4 Nov 2022 09:29:46 +0000
Subject: [PATCH 1/8] gh-98433: Fix quadratic time idna decoding.

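nameprep() re-ran the LCat (stringprep table D.2) scan of the whole label
once for every RandALCat character found, which made IDNA decoding
quadratic. Computing that scan once up front restores linear behavior.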

An early length check would still be a good idea given that DNS IDNA
label names cannot be more than 63 ASCII characters.
---
 Lib/encodings/idna.py                            |  3 ++-
 Lib/test/test_codecs.py                          | 16 ++++++++++++++++
 ...2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst |  3 +++
 3 files changed, 21 insertions(+), 1 deletion(-)
 create mode 100644 Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst

diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py
index ea4058512fe366..54ed582909e068 100644
--- a/Lib/encodings/idna.py
+++ b/Lib/encodings/idna.py
@@ -39,6 +39,7 @@ def nameprep(label):
 
     # Check bidi
     RandAL = [stringprep.in_table_d1(x) for x in label]
+    any_in_table_d2 = any(stringprep.in_table_d2(x) for x in label)
     for c in RandAL:
         if c:
             # There is a RandAL char in the string. Must perform further
@@ -47,7 +48,7 @@ def nameprep(label):
             # This is table C.8, which was already checked
             # 2) If a string contains any RandALCat character, the string
             # MUST NOT contain any LCat character.
-            if any(stringprep.in_table_d2(x) for x in label):
+            if any_in_table_d2:
                 raise UnicodeError("Violation of BIDI requirement 2")
 
             # 3) If a string contains any RandALCat character, a
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 32a704f4e97e41..0929736a573a9a 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -3,6 +3,7 @@
 import io
 import locale
 import sys
+import time
 import unittest
 import encodings
 from unittest import mock
@@ -1552,6 +1553,21 @@ def test_builtin_encode(self):
         self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
         self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")
 
+    def test_builtin_decode_length_limit(self):
+        get_time = time.process_time
+        if get_time() <= 0:  # some platforms like WASM lack process_time()
+            get_time = time.monotonic
+        # This was slow prior to GH-98433's quadratic loop being fixed.
+        # Before: 12s on a rpi4 --with-pydebug. After: 0.12s
+        with self.assertRaises(UnicodeError) as ctx:
+            start = get_time()
+            (b"xn--016c"+b"a"*1000).decode("idna")
+        seconds_to_decode_idna_length_fail = get_time() - start
+        self.assertIn("too long", str(ctx.exception))
+        self.assertLess(
+                elapsed_seconds, 4,
+                msg="idna decoding length failure took waaaay too long")
+
     def test_stream(self):
         r = codecs.getreader("idna")(io.BytesIO(b"abc"))
         r.read(3)
diff --git a/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst b/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst
new file mode 100644
index 00000000000000..290a8680f3ef88
--- /dev/null
+++ b/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst
@@ -0,0 +1,3 @@
+The IDNA codec decoder used on DNS hostnames no longer involves a quadratic
+algorithm. This prevents a potential CPU denial of service if an out-of-spec
+excessive length hostname involving bidirectional characters is decoded.

From 4bf248b97ab3c99b7864a272fe34741cddfea0c9 Mon Sep 17 00:00:00 2001
From: "Gregory P. Smith [Google]" <greg@krypto.org>
Date: Fri, 4 Nov 2022 10:14:15 +0000
Subject: [PATCH 2/8] drop the timing test, add an upfront limit.

---
 Lib/encodings/idna.py   |  5 +++++
 Lib/test/test_codecs.py | 15 ++-------------
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py
index 54ed582909e068..771d13c521963b 100644
--- a/Lib/encodings/idna.py
+++ b/Lib/encodings/idna.py
@@ -113,6 +113,11 @@ def ToUnicode(label):
             pure_ascii = True
         except UnicodeError:
             pure_ascii = False
+    if len(label) > 300:
+        # Per DNS, > 63. This leaves room for nameprep() to remove various
+        # characters while still preventing us from wasting CPU on decoding a
+        # big thing that'll just hit the actual <= 63 length limit in Step 6.
+        raise UnicodeError("label way too long")
     if not pure_ascii:
         # Step 2: Perform nameprep
         label = nameprep(label)
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 0929736a573a9a..2c49256f4800ed 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -3,7 +3,6 @@
 import io
 import locale
 import sys
-import time
 import unittest
 import encodings
 from unittest import mock
@@ -1554,19 +1553,9 @@ def test_builtin_encode(self):
         self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")
 
     def test_builtin_decode_length_limit(self):
-        get_time = time.process_time
-        if get_time() <= 0:  # some platforms like WASM lack process_time()
-            get_time = time.monotonic
-        # This was slow prior to GH-98433's quadratic loop being fixed.
-        # Before: 12s on a rpi4 --with-pydebug. After: 0.12s
         with self.assertRaises(UnicodeError) as ctx:
-            start = get_time()
-            (b"xn--016c"+b"a"*1000).decode("idna")
-        seconds_to_decode_idna_length_fail = get_time() - start
-        self.assertIn("too long", str(ctx.exception))
-        self.assertLess(
-                elapsed_seconds, 4,
-                msg="idna decoding length failure took waaaay too long")
+            (b"xn--016c"+b"a"*500).decode("idna")
+        self.assertIn("label way too long", str(ctx.exception))
 
     def test_stream(self):
         r = codecs.getreader("idna")(io.BytesIO(b"abc"))

From db498cb463b95d268c5eb91b041c6032dcd0df8a Mon Sep 17 00:00:00 2001
From: "Gregory P. Smith [Google]" <greg@krypto.org>
Date: Fri, 4 Nov 2022 10:56:27 +0000
Subject: [PATCH 3/8] Expand the label limit comments.

---
 Lib/encodings/idna.py   | 12 ++++++++----
 Lib/test/test_codecs.py |  2 +-
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py
index 771d13c521963b..d0328f23108420 100644
--- a/Lib/encodings/idna.py
+++ b/Lib/encodings/idna.py
@@ -113,10 +113,14 @@ def ToUnicode(label):
             pure_ascii = True
         except UnicodeError:
             pure_ascii = False
-    if len(label) > 300:
-        # Per DNS, > 63. This leaves room for nameprep() to remove various
-        # characters while still preventing us from wasting CPU on decoding a
-        # big thing that'll just hit the actual <= 63 length limit in Step 6.
+    if len(label) > 1000:
+        # This leaves ample room for nameprep() to remove Nothing characters
+        # while still preventing us from wasting CPU on decoding a big thing
+        # that'll just hit the actual <= 63 length limit in Step 6.
+        # See https://github.com/python/cpython/issues/98433.
+        # https://datatracker.ietf.org/doc/html/rfc5891#section-5.2
+        # doesn't specify an label size limit prior to NAMEPREP. But having
+        # one makes practical sense given the result is <= 63 characters.
         raise UnicodeError("label way too long")
     if not pure_ascii:
         # Step 2: Perform nameprep
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 2c49256f4800ed..3a9e67597de6ec 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1554,7 +1554,7 @@ def test_builtin_encode(self):
 
     def test_builtin_decode_length_limit(self):
         with self.assertRaises(UnicodeError) as ctx:
-            (b"xn--016c"+b"a"*500).decode("idna")
+            (b"xn--016c"+b"a"*1010).decode("idna")
         self.assertIn("label way too long", str(ctx.exception))
 
     def test_stream(self):

From be30a482515d1343821c72a757b73b89f33c9d8a Mon Sep 17 00:00:00 2001
From: "Gregory P. Smith [Google]" <greg@krypto.org>
Date: Fri, 4 Nov 2022 11:00:19 +0000
Subject: [PATCH 4/8] Also add a short "too long" test assert.

---
 Lib/test/test_codecs.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 3a9e67597de6ec..5fc8e23bcc78f7 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1556,6 +1556,9 @@ def test_builtin_decode_length_limit(self):
         with self.assertRaises(UnicodeError) as ctx:
             (b"xn--016c"+b"a"*1010).decode("idna")
         self.assertIn("label way too long", str(ctx.exception))
+        with self.assertRaises(UnicodeError) as ctx:
+            (b"xn--016c"+b"a"*70).decode("idna")
+        self.assertIn("too long", str(ctx.exception))
 
     def test_stream(self):
         r = codecs.getreader("idna")(io.BytesIO(b"abc"))

From 038bbcdb8187ce7bce0a4086b40f9466e8f42780 Mon Sep 17 00:00:00 2001
From: "Gregory P. Smith [Google]" <greg@krypto.org>
Date: Fri, 4 Nov 2022 19:44:46 +0000
Subject: [PATCH 5/8] Refactor into nicer code. Expand the NEWS entry.

---
 Lib/encodings/idna.py                         | 51 +++++++++----------
 Lib/test/test_codecs.py                       |  2 +-
 ...2-11-04-09-29-36.gh-issue-98433.l76c5G.rst | 16 ++++--
 3 files changed, 38 insertions(+), 31 deletions(-)

diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py
index d0328f23108420..a2a77cf3e1899d 100644
--- a/Lib/encodings/idna.py
+++ b/Lib/encodings/idna.py
@@ -39,24 +39,21 @@ def nameprep(label):
 
     # Check bidi
     RandAL = [stringprep.in_table_d1(x) for x in label]
-    any_in_table_d2 = any(stringprep.in_table_d2(x) for x in label)
-    for c in RandAL:
-        if c:
-            # There is a RandAL char in the string. Must perform further
-            # tests:
-            # 1) The characters in section 5.8 MUST be prohibited.
-            # This is table C.8, which was already checked
-            # 2) If a string contains any RandALCat character, the string
-            # MUST NOT contain any LCat character.
-            if any_in_table_d2:
-                raise UnicodeError("Violation of BIDI requirement 2")
-
-            # 3) If a string contains any RandALCat character, a
-            # RandALCat character MUST be the first character of the
-            # string, and a RandALCat character MUST be the last
-            # character of the string.
-            if not RandAL[0] or not RandAL[-1]:
-                raise UnicodeError("Violation of BIDI requirement 3")
+    if any(RandAL):
+        # There is a RandAL char in the string. Must perform further
+        # tests:
+        # 1) The characters in section 5.8 MUST be prohibited.
+        # This is table C.8, which was already checked
+        # 2) If a string contains any RandALCat character, the string
+        # MUST NOT contain any LCat character.
+        if any(stringprep.in_table_d2(x) for x in label):
+            raise UnicodeError("Violation of BIDI requirement 2")
+        # 3) If a string contains any RandALCat character, a
+        # RandALCat character MUST be the first character of the
+        # string, and a RandALCat character MUST be the last
+        # character of the string.
+        if not RandAL[0] or not RandAL[-1]:
+            raise UnicodeError("Violation of BIDI requirement 3")
 
     return label
 
@@ -104,6 +101,15 @@ def ToASCII(label):
     raise UnicodeError("label empty or too long")
 
 def ToUnicode(label):
+    if len(label) > 1000:
+        # Protection from https://github.com/python/cpython/issues/98433.
+        # https://datatracker.ietf.org/doc/html/rfc5894#section-6
+        # doesn't specify a label size limit prior to NAMEPREP. But having
+        # one makes practical sense.
+        # This leaves ample room for nameprep() to remove Nothing characters
+        # while still preventing us from wasting CPU decoding a big thing
+        # that'll just hit the actual <= 63 length limit in Step 6.
+        raise UnicodeError("label way too long")
     # Step 1: Check for ASCII
     if isinstance(label, bytes):
         pure_ascii = True
@@ -113,15 +119,6 @@ def ToUnicode(label):
             pure_ascii = True
         except UnicodeError:
             pure_ascii = False
-    if len(label) > 1000:
-        # This leaves ample room for nameprep() to remove Nothing characters
-        # while still preventing us from wasting CPU on decoding a big thing
-        # that'll just hit the actual <= 63 length limit in Step 6.
-        # See https://github.com/python/cpython/issues/98433.
-        # https://datatracker.ietf.org/doc/html/rfc5891#section-5.2
-        # doesn't specify an label size limit prior to NAMEPREP. But having
-        # one makes practical sense given the result is <= 63 characters.
-        raise UnicodeError("label way too long")
     if not pure_ascii:
         # Step 2: Perform nameprep
         label = nameprep(label)
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 5fc8e23bcc78f7..dbbdd40bac32e4 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1555,7 +1555,7 @@ def test_builtin_encode(self):
     def test_builtin_decode_length_limit(self):
         with self.assertRaises(UnicodeError) as ctx:
             (b"xn--016c"+b"a"*1010).decode("idna")
-        self.assertIn("label way too long", str(ctx.exception))
+        self.assertIn("way too long", str(ctx.exception))
         with self.assertRaises(UnicodeError) as ctx:
             (b"xn--016c"+b"a"*70).decode("idna")
         self.assertIn("too long", str(ctx.exception))
diff --git a/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst b/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst
index 290a8680f3ef88..531fbf953b5615 100644
--- a/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst
+++ b/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst
@@ -1,3 +1,13 @@
-The IDNA codec decoder used on DNS hostnames no longer involves a quadratic
-algorithm. This prevents a potential CPU denial of service if an out-of-spec
-excessive length hostname involving bidirectional characters is decoded.
+The IDNA codec decoder used on DNS hostnames by :mod:`socket` or :mod:`asyncio`
+related name resolution functions no longer involves a quadratic algorithm.
+This prevents a potential CPU denial of service if an out-of-spec excessive
+length hostname involving bidirectional characters were decoded. Some protocols
+such as :mod:`urllib` http ``3xx`` redirects potentially allow for an attacker
+to supply such a name.
+
+Individual labels within a DNS name will also now raise an error during IDNA
+decoding if they are longer than 1000 characters given that each decoded DNS
+label must be 63 or fewer characters. Only an application presenting a hostname
+value consisting primarily of "Nothing" characters to be removed would run into
+of this limit. Applications relying on this are not expected to exist. See
+:rfc:`5894` section 6 and :rfc:`3491`.

From cf3a20ec7b8fac342b4afd9d42a8571e0cc73452 Mon Sep 17 00:00:00 2001
From: "Gregory P. Smith [Google]" <greg@krypto.org>
Date: Fri, 4 Nov 2022 19:54:05 +0000
Subject: [PATCH 6/8] news wording tweak.

---
 .../2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst    | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst b/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst
index 531fbf953b5615..8ad03ab961f003 100644
--- a/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst
+++ b/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst
@@ -5,9 +5,9 @@ length hostname involving bidirectional characters were decoded. Some protocols
 such as :mod:`urllib` http ``3xx`` redirects potentially allow for an attacker
 to supply such a name.
 
-Individual labels within a DNS name will also now raise an error during IDNA
-decoding if they are longer than 1000 characters given that each decoded DNS
-label must be 63 or fewer characters. Only an application presenting a hostname
-value consisting primarily of "Nothing" characters to be removed would run into
-of this limit. Applications relying on this are not expected to exist. See
-:rfc:`5894` section 6 and :rfc:`3491`.
+Individual labels within an IDNA encoded DNS name will now raise an error early
+during IDNA decoding if they are longer than 1000 encoded characters given that
+each decoded DNS label must be 63 or fewer characters. Only an application
+presenting a suspicious hostname value consisting primarily of "Nothing"
+characters to be removed would run into of this new limit. See :rfc:`5894`
+section 6 and :rfc:`3491`.

From bd51456952dedc5af1864b6595d97e78bf67a9c0 Mon Sep 17 00:00:00 2001
From: "Gregory P. Smith [Google]" <greg@krypto.org>
Date: Fri, 4 Nov 2022 20:26:02 +0000
Subject: [PATCH 7/8] more RFC links and explanation.

---
 Lib/encodings/idna.py                                 |  7 ++++---
 Lib/test/test_codecs.py                               |  2 +-
 .../2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst     | 11 ++++++-----
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py
index a2a77cf3e1899d..5396047a7fb0b8 100644
--- a/Lib/encodings/idna.py
+++ b/Lib/encodings/idna.py
@@ -101,14 +101,15 @@ def ToASCII(label):
     raise UnicodeError("label empty or too long")
 
 def ToUnicode(label):
-    if len(label) > 1000:
+    if len(label) > 1024:
         # Protection from https://github.com/python/cpython/issues/98433.
         # https://datatracker.ietf.org/doc/html/rfc5894#section-6
         # doesn't specify a label size limit prior to NAMEPREP. But having
         # one makes practical sense.
         # This leaves ample room for nameprep() to remove Nothing characters
-        # while still preventing us from wasting CPU decoding a big thing
-        # that'll just hit the actual <= 63 length limit in Step 6.
+        # per https://www.rfc-editor.org/rfc/rfc3454#section-3.1 while still
+        # preventing us from wasting time decoding a big thing that'll just
+        # hit the actual <= 63 length limit in Step 6.
         raise UnicodeError("label way too long")
     # Step 1: Check for ASCII
     if isinstance(label, bytes):
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index dbbdd40bac32e4..edb763db6c6773 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1554,7 +1554,7 @@ def test_builtin_encode(self):
 
     def test_builtin_decode_length_limit(self):
         with self.assertRaises(UnicodeError) as ctx:
-            (b"xn--016c"+b"a"*1010).decode("idna")
+            (b"xn--016c"+b"a"*1100).decode("idna")
         self.assertIn("way too long", str(ctx.exception))
         with self.assertRaises(UnicodeError) as ctx:
             (b"xn--016c"+b"a"*70).decode("idna")
diff --git a/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst b/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst
index 8ad03ab961f003..0d649dc6a9f10d 100644
--- a/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst
+++ b/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst
@@ -6,8 +6,9 @@ such as :mod:`urllib` http ``3xx`` redirects potentially allow for an attacker
 to supply such a name.
 
 Individual labels within an IDNA encoded DNS name will now raise an error early
-during IDNA decoding if they are longer than 1000 encoded characters given that
-each decoded DNS label must be 63 or fewer characters. Only an application
-presenting a suspicious hostname value consisting primarily of "Nothing"
-characters to be removed would run into of this new limit. See :rfc:`5894`
-section 6 and :rfc:`3491`.
+during IDNA decoding if they are longer than 1024 Unicode characters given that
+each decoded DNS label must be 63 or fewer characters and the entire decoded
+DNS name is limited to 255. Only an application presenting a hostname or label
+consisting primarily of :rfc:`3454` section 3.1 "Nothing" characters to be
+removed would run into this new limit. See also :rfc:`5894` section 6 and
+:rfc:`3491`.

From 5a3b03750f57429f7529c9e0e3603c07d38cedd9 Mon Sep 17 00:00:00 2001
From: "Gregory P. Smith [Google]" <greg@krypto.org>
Date: Mon, 7 Nov 2022 23:10:59 +0000
Subject: [PATCH 8/8] assertRaisesRegex

thanks victor!
---
 Lib/test/test_codecs.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index edb763db6c6773..e3add0c1ee926c 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1553,12 +1553,10 @@ def test_builtin_encode(self):
         self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")
 
     def test_builtin_decode_length_limit(self):
-        with self.assertRaises(UnicodeError) as ctx:
+        with self.assertRaisesRegex(UnicodeError, "way too long"):
             (b"xn--016c"+b"a"*1100).decode("idna")
-        self.assertIn("way too long", str(ctx.exception))
-        with self.assertRaises(UnicodeError) as ctx:
+        with self.assertRaisesRegex(UnicodeError, "too long"):
             (b"xn--016c"+b"a"*70).decode("idna")
-        self.assertIn("too long", str(ctx.exception))
 
     def test_stream(self):
         r = codecs.getreader("idna")(io.BytesIO(b"abc"))