From 365a6cb2e43b51a2a0fa80b8b21f489f3a397532 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith [Google]" Date: Fri, 4 Nov 2022 09:29:46 +0000 Subject: [PATCH 1/8] gh-98433: Fix quadratic time idna decoding. There was an unnecessary quadratic loop in idna decoding. This restores the behavior to linear. An early length check would still be a good idea given that DNS IDNA label names cannot be more than 63 ASCII characters. --- Lib/encodings/idna.py | 3 ++- Lib/test/test_codecs.py | 16 ++++++++++++++++ ...2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst | 3 +++ 3 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py index ea4058512fe366..54ed582909e068 100644 --- a/Lib/encodings/idna.py +++ b/Lib/encodings/idna.py @@ -39,6 +39,7 @@ def nameprep(label): # Check bidi RandAL = [stringprep.in_table_d1(x) for x in label] + any_in_table_d2 = any(stringprep.in_table_d2(x) for x in label) for c in RandAL: if c: # There is a RandAL char in the string. Must perform further @@ -47,7 +48,7 @@ def nameprep(label): # This is table C.8, which was already checked # 2) If a string contains any RandALCat character, the string # MUST NOT contain any LCat character. 
- if any(stringprep.in_table_d2(x) for x in label): + if any_in_table_d2: raise UnicodeError("Violation of BIDI requirement 2") # 3) If a string contains any RandALCat character, a diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 32a704f4e97e41..0929736a573a9a 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3,6 +3,7 @@ import io import locale import sys +import time import unittest import encodings from unittest import mock @@ -1552,6 +1553,21 @@ def test_builtin_encode(self): self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org") self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.") + def test_builtin_decode_length_limit(self): + get_time = time.process_time + if get_time() <= 0: # some platforms like WASM lack process_time() + get_time = time.monotonic + # This was slow prior to GH-98433's quadratic loop being fixed. + # Before: 12s on a rpi4 --with-pydebug. After: 0.12s + with self.assertRaises(UnicodeError) as ctx: + start = get_time() + (b"xn--016c"+b"a"*1000).decode("idna") + seconds_to_decode_idna_length_fail = get_time() - start + self.assertIn("too long", str(ctx.exception)) + self.assertLess( + elapsed_seconds, 4, + msg="idna decoding length failure took waaaay too long") + def test_stream(self): r = codecs.getreader("idna")(io.BytesIO(b"abc")) r.read(3) diff --git a/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst b/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst new file mode 100644 index 00000000000000..290a8680f3ef88 --- /dev/null +++ b/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst @@ -0,0 +1,3 @@ +The IDNA codec decoder used on DNS hostnames no longer involves a quadratic +algorithm. This prevents a potential CPU denial of service if an out-of-spec +excessive length hostname involving bidirectional characters is decoded. 
From 4bf248b97ab3c99b7864a272fe34741cddfea0c9 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith [Google]" Date: Fri, 4 Nov 2022 10:14:15 +0000 Subject: [PATCH 2/8] drop the timing test, add an upfront limit. --- Lib/encodings/idna.py | 5 +++++ Lib/test/test_codecs.py | 15 ++------------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py index 54ed582909e068..771d13c521963b 100644 --- a/Lib/encodings/idna.py +++ b/Lib/encodings/idna.py @@ -113,6 +113,11 @@ def ToUnicode(label): pure_ascii = True except UnicodeError: pure_ascii = False + if len(label) > 300: + # Per DNS, > 63. This leaves room for nameprep() to remove various + # characters while still preventing us from wasting CPU on decoding a + # big thing that'll just hit the actual <= 63 length limit in Step 6. + raise UnicodeError("label way too long") if not pure_ascii: # Step 2: Perform nameprep label = nameprep(label) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 0929736a573a9a..2c49256f4800ed 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3,7 +3,6 @@ import io import locale import sys -import time import unittest import encodings from unittest import mock @@ -1554,19 +1553,9 @@ def test_builtin_encode(self): self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.") def test_builtin_decode_length_limit(self): - get_time = time.process_time - if get_time() <= 0: # some platforms like WASM lack process_time() - get_time = time.monotonic - # This was slow prior to GH-98433's quadratic loop being fixed. - # Before: 12s on a rpi4 --with-pydebug. 
After: 0.12s with self.assertRaises(UnicodeError) as ctx: - start = get_time() - (b"xn--016c"+b"a"*1000).decode("idna") - seconds_to_decode_idna_length_fail = get_time() - start - self.assertIn("too long", str(ctx.exception)) - self.assertLess( - elapsed_seconds, 4, - msg="idna decoding length failure took waaaay too long") + (b"xn--016c"+b"a"*500).decode("idna") + self.assertIn("label way too long", str(ctx.exception)) def test_stream(self): r = codecs.getreader("idna")(io.BytesIO(b"abc")) From db498cb463b95d268c5eb91b041c6032dcd0df8a Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith [Google]" Date: Fri, 4 Nov 2022 10:56:27 +0000 Subject: [PATCH 3/8] Expand the label limit comments. --- Lib/encodings/idna.py | 12 ++++++++---- Lib/test/test_codecs.py | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py index 771d13c521963b..d0328f23108420 100644 --- a/Lib/encodings/idna.py +++ b/Lib/encodings/idna.py @@ -113,10 +113,14 @@ def ToUnicode(label): pure_ascii = True except UnicodeError: pure_ascii = False - if len(label) > 300: - # Per DNS, > 63. This leaves room for nameprep() to remove various - # characters while still preventing us from wasting CPU on decoding a - # big thing that'll just hit the actual <= 63 length limit in Step 6. + if len(label) > 1000: + # This leaves ample room for nameprep() to remove Nothing characters + # while still preventing us from wasting CPU on decoding a big thing + # that'll just hit the actual <= 63 length limit in Step 6. + # See https://github.com/python/cpython/issues/98433. + # https://datatracker.ietf.org/doc/html/rfc5891#section-5.2 + # doesn't specify an label size limit prior to NAMEPREP. But having + # one makes practical sense given the result is <= 63 characters. 
raise UnicodeError("label way too long") if not pure_ascii: # Step 2: Perform nameprep diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 2c49256f4800ed..3a9e67597de6ec 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1554,7 +1554,7 @@ def test_builtin_encode(self): def test_builtin_decode_length_limit(self): with self.assertRaises(UnicodeError) as ctx: - (b"xn--016c"+b"a"*500).decode("idna") + (b"xn--016c"+b"a"*1010).decode("idna") self.assertIn("label way too long", str(ctx.exception)) def test_stream(self): From be30a482515d1343821c72a757b73b89f33c9d8a Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith [Google]" Date: Fri, 4 Nov 2022 11:00:19 +0000 Subject: [PATCH 4/8] Also add a short "too long" test assert. --- Lib/test/test_codecs.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 3a9e67597de6ec..5fc8e23bcc78f7 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1556,6 +1556,9 @@ def test_builtin_decode_length_limit(self): with self.assertRaises(UnicodeError) as ctx: (b"xn--016c"+b"a"*1010).decode("idna") self.assertIn("label way too long", str(ctx.exception)) + with self.assertRaises(UnicodeError) as ctx: + (b"xn--016c"+b"a"*70).decode("idna") + self.assertIn("too long", str(ctx.exception)) def test_stream(self): r = codecs.getreader("idna")(io.BytesIO(b"abc")) From 038bbcdb8187ce7bce0a4086b40f9466e8f42780 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith [Google]" Date: Fri, 4 Nov 2022 19:44:46 +0000 Subject: [PATCH 5/8] Refactor into nicer code. Expand the NEWS entry. 
--- Lib/encodings/idna.py | 51 +++++++++---------- Lib/test/test_codecs.py | 2 +- ...2-11-04-09-29-36.gh-issue-98433.l76c5G.rst | 16 ++++-- 3 files changed, 38 insertions(+), 31 deletions(-) diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py index d0328f23108420..a2a77cf3e1899d 100644 --- a/Lib/encodings/idna.py +++ b/Lib/encodings/idna.py @@ -39,24 +39,21 @@ def nameprep(label): # Check bidi RandAL = [stringprep.in_table_d1(x) for x in label] - any_in_table_d2 = any(stringprep.in_table_d2(x) for x in label) - for c in RandAL: - if c: - # There is a RandAL char in the string. Must perform further - # tests: - # 1) The characters in section 5.8 MUST be prohibited. - # This is table C.8, which was already checked - # 2) If a string contains any RandALCat character, the string - # MUST NOT contain any LCat character. - if any_in_table_d2: - raise UnicodeError("Violation of BIDI requirement 2") - - # 3) If a string contains any RandALCat character, a - # RandALCat character MUST be the first character of the - # string, and a RandALCat character MUST be the last - # character of the string. - if not RandAL[0] or not RandAL[-1]: - raise UnicodeError("Violation of BIDI requirement 3") + if any(RandAL): + # There is a RandAL char in the string. Must perform further + # tests: + # 1) The characters in section 5.8 MUST be prohibited. + # This is table C.8, which was already checked + # 2) If a string contains any RandALCat character, the string + # MUST NOT contain any LCat character. + if any(stringprep.in_table_d2(x) for x in label): + raise UnicodeError("Violation of BIDI requirement 2") + # 3) If a string contains any RandALCat character, a + # RandALCat character MUST be the first character of the + # string, and a RandALCat character MUST be the last + # character of the string. 
+ if not RandAL[0] or not RandAL[-1]: + raise UnicodeError("Violation of BIDI requirement 3") return label @@ -104,6 +101,15 @@ def ToASCII(label): raise UnicodeError("label empty or too long") def ToUnicode(label): + if len(label) > 1000: + # Protection from https://github.com/python/cpython/issues/98433. + # https://datatracker.ietf.org/doc/html/rfc5894#section-6 + # doesn't specify a label size limit prior to NAMEPREP. But having + # one makes practical sense. + # This leaves ample room for nameprep() to remove Nothing characters + # while still preventing us from wasting CPU decoding a big thing + # that'll just hit the actual <= 63 length limit in Step 6. + raise UnicodeError("label way too long") # Step 1: Check for ASCII if isinstance(label, bytes): pure_ascii = True @@ -113,15 +119,6 @@ def ToUnicode(label): pure_ascii = True except UnicodeError: pure_ascii = False - if len(label) > 1000: - # This leaves ample room for nameprep() to remove Nothing characters - # while still preventing us from wasting CPU on decoding a big thing - # that'll just hit the actual <= 63 length limit in Step 6. - # See https://github.com/python/cpython/issues/98433. - # https://datatracker.ietf.org/doc/html/rfc5891#section-5.2 - # doesn't specify an label size limit prior to NAMEPREP. But having - # one makes practical sense given the result is <= 63 characters. 
- raise UnicodeError("label way too long") if not pure_ascii: # Step 2: Perform nameprep label = nameprep(label) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 5fc8e23bcc78f7..dbbdd40bac32e4 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1555,7 +1555,7 @@ def test_builtin_encode(self): def test_builtin_decode_length_limit(self): with self.assertRaises(UnicodeError) as ctx: (b"xn--016c"+b"a"*1010).decode("idna") - self.assertIn("label way too long", str(ctx.exception)) + self.assertIn("way too long", str(ctx.exception)) with self.assertRaises(UnicodeError) as ctx: (b"xn--016c"+b"a"*70).decode("idna") self.assertIn("too long", str(ctx.exception)) diff --git a/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst b/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst index 290a8680f3ef88..531fbf953b5615 100644 --- a/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst +++ b/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst @@ -1,3 +1,13 @@ -The IDNA codec decoder used on DNS hostnames no longer involves a quadratic -algorithm. This prevents a potential CPU denial of service if an out-of-spec -excessive length hostname involving bidirectional characters is decoded. +The IDNA codec decoder used on DNS hostnames by :mod:`socket` or :mod:`asyncio` +related name resolution functions no longer involves a quadratic algorithm. +This prevents a potential CPU denial of service if an out-of-spec excessive +length hostname involving bidirectional characters were decoded. Some protocols +such as :mod:`urllib` http ``3xx`` redirects potentially allow for an attacker +to supply such a name. + +Individual labels within a DNS name will also now raise an error during IDNA +decoding if they are longer than 1000 characters given that each decoded DNS +label must be 63 or fewer characters. 
Only an application presenting a hostname +value consisting primarily of "Nothing" characters to be removed would run into +of this limit. Applications relying on this are not expected to exist. See +:rfc:`5894` section 6 and :rfc:`3491`. From cf3a20ec7b8fac342b4afd9d42a8571e0cc73452 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith [Google]" Date: Fri, 4 Nov 2022 19:54:05 +0000 Subject: [PATCH 6/8] news wording tweak. --- .../2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst b/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst index 531fbf953b5615..8ad03ab961f003 100644 --- a/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst +++ b/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst @@ -5,9 +5,9 @@ length hostname involving bidirectional characters were decoded. Some protocols such as :mod:`urllib` http ``3xx`` redirects potentially allow for an attacker to supply such a name. -Individual labels within a DNS name will also now raise an error during IDNA -decoding if they are longer than 1000 characters given that each decoded DNS -label must be 63 or fewer characters. Only an application presenting a hostname -value consisting primarily of "Nothing" characters to be removed would run into -of this limit. Applications relying on this are not expected to exist. See -:rfc:`5894` section 6 and :rfc:`3491`. +Individual labels within an IDNA encoded DNS name will now raise an error early +during IDNA decoding if they are longer than 1000 encoded characters given that +each decoded DNS label must be 63 or fewer characters. Only an application +presenting a suspicious hostname value consisting primarily of "Nothing" +characters to be removed would run into of this new limit. See :rfc:`5894` +section 6 and :rfc:`3491`. 
From bd51456952dedc5af1864b6595d97e78bf67a9c0 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith [Google]" Date: Fri, 4 Nov 2022 20:26:02 +0000 Subject: [PATCH 7/8] more RFC links and explanation. --- Lib/encodings/idna.py | 7 ++++--- Lib/test/test_codecs.py | 2 +- .../2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst | 11 ++++++----- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py index a2a77cf3e1899d..5396047a7fb0b8 100644 --- a/Lib/encodings/idna.py +++ b/Lib/encodings/idna.py @@ -101,14 +101,15 @@ def ToASCII(label): raise UnicodeError("label empty or too long") def ToUnicode(label): - if len(label) > 1000: + if len(label) > 1024: # Protection from https://github.com/python/cpython/issues/98433. # https://datatracker.ietf.org/doc/html/rfc5894#section-6 # doesn't specify a label size limit prior to NAMEPREP. But having # one makes practical sense. # This leaves ample room for nameprep() to remove Nothing characters - # while still preventing us from wasting CPU decoding a big thing - # that'll just hit the actual <= 63 length limit in Step 6. + # per https://www.rfc-editor.org/rfc/rfc3454#section-3.1 while still + # preventing us from wasting time decoding a big thing that'll just + # hit the actual <= 63 length limit in Step 6. 
raise UnicodeError("label way too long") # Step 1: Check for ASCII if isinstance(label, bytes): diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index dbbdd40bac32e4..edb763db6c6773 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1554,7 +1554,7 @@ def test_builtin_encode(self): def test_builtin_decode_length_limit(self): with self.assertRaises(UnicodeError) as ctx: - (b"xn--016c"+b"a"*1010).decode("idna") + (b"xn--016c"+b"a"*1100).decode("idna") self.assertIn("way too long", str(ctx.exception)) with self.assertRaises(UnicodeError) as ctx: (b"xn--016c"+b"a"*70).decode("idna") diff --git a/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst b/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst index 8ad03ab961f003..0d649dc6a9f10d 100644 --- a/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst +++ b/Misc/NEWS.d/next/Security/2022-11-04-09-29-36.gh-issue-98433.l76c5G.rst @@ -6,8 +6,9 @@ such as :mod:`urllib` http ``3xx`` redirects potentially allow for an attacker to supply such a name. Individual labels within an IDNA encoded DNS name will now raise an error early -during IDNA decoding if they are longer than 1000 encoded characters given that -each decoded DNS label must be 63 or fewer characters. Only an application -presenting a suspicious hostname value consisting primarily of "Nothing" -characters to be removed would run into of this new limit. See :rfc:`5894` -section 6 and :rfc:`3491`. +during IDNA decoding if they are longer than 1024 Unicode characters given that +each decoded DNS label must be 63 or fewer characters and the entire decoded +DNS name is limited to 255. Only an application presenting a hostname or label +consisting primarily of :rfc:`3454` section 3.1 "Nothing" characters to be +removed would run into this new limit. See also :rfc:`5894` section 6 and +:rfc:`3491`. 
From 5a3b03750f57429f7529c9e0e3603c07d38cedd9 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith [Google]" Date: Mon, 7 Nov 2022 23:10:59 +0000 Subject: [PATCH 8/8] assertRaisesRegex thanks victor! --- Lib/test/test_codecs.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index edb763db6c6773..e3add0c1ee926c 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1553,12 +1553,10 @@ def test_builtin_encode(self): self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.") def test_builtin_decode_length_limit(self): - with self.assertRaises(UnicodeError) as ctx: + with self.assertRaisesRegex(UnicodeError, "way too long"): (b"xn--016c"+b"a"*1100).decode("idna") - self.assertIn("way too long", str(ctx.exception)) - with self.assertRaises(UnicodeError) as ctx: + with self.assertRaisesRegex(UnicodeError, "too long"): (b"xn--016c"+b"a"*70).decode("idna") - self.assertIn("too long", str(ctx.exception)) def test_stream(self): r = codecs.getreader("idna")(io.BytesIO(b"abc"))