From 9c811d23da27543e9f4e3843d203355612c29b16 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Thu, 30 Apr 2020 15:03:48 -0400 Subject: [PATCH 001/174] Add making a new GitHub release to the release instructions --- README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 720e206..f12d921 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ address information still works, so it is backwards compatible.** Installation ------------ -This package is on PyPI, so: +This package [is on PyPI](https://pypi.org/project/email-validator/), so: ```sh pip install email_validator @@ -385,14 +385,13 @@ make test For Project Maintainers ----------------------- -The package is distributed as a universal wheel. The wheel is specified -as universal in the file `setup.cfg` by the `universal = 1` key in the -`[bdist_wheel]` section. +The package is distributed as a universal wheel and as a source package. To release: * Update the version number. -* Follow the steps below to publish source and a universal wheel to pypi: +* Follow the steps below to publish source and a universal wheel to pypi. +* Make a release at https://github.com/JoshData/python-email-validator/releases/new. ```sh pip3 install twine @@ -403,3 +402,6 @@ twine upload dist/* git tag v1.0.XXX # replace with version in setup.py git push --tags ``` + +Notes: The wheel is specified as universal in the file `setup.cfg` by the `universal = 1` key in the +`[bdist_wheel]` section. \ No newline at end of file From 7a19c33e02bd5798873b22db862ecff5d7e7a34f Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 18 May 2020 07:58:41 -0400 Subject: [PATCH 002/174] Fix exception on DNS timeout In 8243bd238fbfbbef9522471ded214420e1b1fe7e, I re-organized the return value of validate_email to be an object rather than a dict. However I didn't properly handle the case when there's a DNS timeout. 
I think my original intention was to hide timeout errors because it's unclear if a timeout is a local problem or a problem with the email's domain name. If it's the former, passing through the timeout exception to the caller would make sense. But if it's something that can be triggered by user input, then it shouldn't raise a timeout exception; it should raise an EmailUndeliverableError. Since we can't decide which is the case, we treat it as if the deliverability check was not performed at all. Fixes #41 --- README.md | 7 +++++-- email_validator/__init__.py | 15 +++++++++++---- tests/test_main.py | 8 ++++++++ 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index f12d921..8b93ddd 100644 --- a/README.md +++ b/README.md @@ -348,12 +348,15 @@ are: `mx`: A list of (priority, domain) tuples of MX records specified in the DNS for the domain (see [RFC 5321 section - 5](https://tools.ietf.org/html/rfc5321#section-5)). + 5](https://tools.ietf.org/html/rfc5321#section-5)). May be `None` if + the deliverability check could not be completed because of a temporary + issue like a timeout. `mx_fallback_type`: `None` if an `MX` record is found. If no MX records are actually specified in DNS and instead are inferred, through an obsolete mechanism, from A or AAAA records, the value is the type of DNS - record used instead (`A` or `AAAA`). + record used instead (`A` or `AAAA`). May be `None` if the deliverability check + could not be completed because of a temporary issue like a timeout. 
Assumptions ----------- diff --git a/email_validator/__init__.py b/email_validator/__init__.py index e1b8fe2..61825a4 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -92,8 +92,8 @@ class ValidatedEmail(object): is False.""" smtputf8 = None - """If a deliverability check is performed, a list of (priority, domain) tuples of MX - records specified in the DNS for the domain.""" + """If a deliverability check is performed and if it succeeds, a list of (priority, domain) + tuples of MX records specified in the DNS for the domain.""" mx = None """If no MX records are actually specified in DNS and instead are inferred, through an obsolete @@ -263,8 +263,9 @@ def validate_email( # Validate the email address's deliverability and update the # return dict with metadata. deliverability_info = validate_email_deliverability(ret["domain"], ret["domain_i18n"], timeout) - ret.mx = deliverability_info["mx"] - ret.mx_fallback_type = deliverability_info["mx-fallback"] + if "mx" in deliverability_info: + ret.mx = deliverability_info["mx"] + ret.mx_fallback_type = deliverability_info["mx-fallback"] return ret @@ -441,6 +442,12 @@ def validate_email_deliverability(domain, domain_i18n, timeout=DEFAULT_TIMEOUT): domain += '.' try: + # We need a way to check how timeouts are handled in the tests. So we + # have a secret variable that if set makes this method always test the + # handling of a timeout. 
+ if getattr(validate_email_deliverability, 'TEST_CHECK_TIMEOUT', False): + raise dns.exception.Timeout() + resolver = dns.resolver.get_default_resolver() if timeout: diff --git a/tests/test_main.py b/tests/test_main.py index 0ed1b36..fb75463 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -268,3 +268,11 @@ def test_deliverability_fails(): domain = 'xkxufoekjvjfjeodlfmdfjcu.com' with pytest.raises(EmailUndeliverableError, match='The domain name {} does not exist'.format(domain)): validate_email_deliverability(domain, domain) + +def test_deliverability_dns_timeout(): + validate_email_deliverability.TEST_CHECK_TIMEOUT = True + response = validate_email_deliverability('gmail.com', 'gmail.com') + assert "mx" not in response + assert response.get("unknown-deliverability") == "timeout" + email = validate_email('test@gmail.com') + del validate_email_deliverability.TEST_CHECK_TIMEOUT \ No newline at end of file From a846064f65ddd25833a259f663b2bfb82abebc64 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 18 May 2020 08:15:26 -0400 Subject: [PATCH 003/174] Mention Punycode in the README --- README.md | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 8b93ddd..fe773b0 100644 --- a/README.md +++ b/README.md @@ -141,8 +141,8 @@ internationalization. The first is [internationalized domain names (RFC 5891)](https://tools.ietf.org/html/rfc5891), a.k.a IDNA 2008. The DNS -system has not been updated with Unicode support. Instead, -internationalized domain names are converted into a special IDNA ASCII +system has not been updated with Unicode support. Instead, internationalized +domain names are converted into a special IDNA ASCII "[Punycode](https://www.rfc-editor.org/rfc/rfc3492.txt)" form starting with `xn--`. When an email address has non-ASCII characters in its domain part, the domain part is replaced with its IDNA ASCII equivalent form in the process of mail transmission. 
Your mail @@ -279,7 +279,7 @@ Note that `smtputf8` is `False` even though the domain part is internationalized because [SMTPUTF8](https://tools.ietf.org/html/rfc6531) is only needed if the local part of the address is internationalized (the domain part can be -converted to IDNA ASCII). Also note that the `email` and `domain` +converted to IDNA ASCII Punycode). Also note that the `email` and `domain` fields provide a normalized form of the email address and domain name (casefolding and Unicode normalization as required by IDNA 2008). @@ -314,7 +314,8 @@ are: fields (see below). `ascii_email`: If set, an ASCII-only form of the email address by replacing the - domain part with [IDNA ASCII](https://tools.ietf.org/html/rfc5891). + domain part with [IDNA](https://tools.ietf.org/html/rfc5891) + [Punycode](https://www.rfc-editor.org/rfc/rfc3492.txt). This field will be present when an ASCII-only form of the email address exists (including if the email address is already ASCII). If the local part of the email address contains internationalized @@ -326,16 +327,16 @@ are: `ascii_local_part`: If set, the local part, which is composed of ASCII characters only. -`domain`: The canonical internationalized form of the domain part of the - address, by round-tripping through IDNA ASCII. If the returned - string contains non-ASCII characters, either the +`domain`: The canonical internationalized Unicode form of the domain part of the + email address. If the returned string contains non-ASCII characters, either the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) feature of your mail relay will be required to transmit the message or else the - email address's domain part must be converted to IDNA ASCII first - (given in the returned `domain` field). + email address's domain part must be converted to IDNA ASCII first: Use + `ascii_domain` field instead. 
-`ascii_domain`: The [IDNA ASCII](https://tools.ietf.org/html/rfc5891)-encoded form - of the domain part of the given email address (after the @-sign), as +`ascii_domain`: The [IDNA](https://tools.ietf.org/html/rfc5891) + [Punycode](https://www.rfc-editor.org/rfc/rfc3492.txt)-encoded + form of the domain part of the given email address, as it would be transmitted on the wire. `smtputf8`: A boolean indicating that the From 915dff452177e33ea9482cc69607021a79bb5d2a Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 18 May 2020 08:23:31 -0400 Subject: [PATCH 004/174] flake8 fixes --- Makefile | 2 +- email_validator/__init__.py | 26 +++++++++++++------------- tests/test_main.py | 5 +++-- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index a3fbd6f..ac92c99 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ install: .PHONY: lint lint: #python setup.py check -rms - flake8 email_validator tests + flake8 --ignore=E501,E126 email_validator tests .PHONY: test test: diff --git a/email_validator/__init__.py b/email_validator/__init__.py index 61825a4..83761cd 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -136,25 +136,25 @@ def __getitem__(self, key): """Tests use this.""" def __eq__(self, other): if self.email == other.email and self.local_part == other.local_part and self.domain == other.domain \ - and self.ascii_email == other.ascii_email and self.ascii_local_part == other.ascii_local_part \ - and self.ascii_domain == other.ascii_domain \ - and self.smtputf8 == other.smtputf8 \ - and repr(sorted(self.mx) if self.mx else self.mx) == repr(sorted(other.mx) if other.mx else other.mx) \ - and self.mx_fallback_type == other.mx_fallback_type: + and self.ascii_email == other.ascii_email and self.ascii_local_part == other.ascii_local_part \ + and self.ascii_domain == other.ascii_domain \ + and self.smtputf8 == other.smtputf8 \ + and repr(sorted(self.mx) if self.mx else self.mx) == repr(sorted(other.mx) if 
other.mx else other.mx) \ + and self.mx_fallback_type == other.mx_fallback_type: return True return False """This helps producing the README.""" def as_constructor(self): return "ValidatedEmail(" \ - + ",".join("\n {}={}".format( - key, - repr(getattr(self, key))) - for key in ('email', 'local_part', 'domain', - 'ascii_email', 'ascii_local_part', 'ascii_domain', - 'smtputf8', 'mx', 'mx_fallback_type') - ) \ - + ")" + + ",".join("\n {}={}".format( + key, + repr(getattr(self, key))) + for key in ('email', 'local_part', 'domain', + 'ascii_email', 'ascii_local_part', 'ascii_domain', + 'smtputf8', 'mx', 'mx_fallback_type') + ) \ + + ")" def validate_email( diff --git a/tests/test_main.py b/tests/test_main.py index fb75463..7d5fe4b 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -269,10 +269,11 @@ def test_deliverability_fails(): with pytest.raises(EmailUndeliverableError, match='The domain name {} does not exist'.format(domain)): validate_email_deliverability(domain, domain) + def test_deliverability_dns_timeout(): validate_email_deliverability.TEST_CHECK_TIMEOUT = True response = validate_email_deliverability('gmail.com', 'gmail.com') assert "mx" not in response assert response.get("unknown-deliverability") == "timeout" - email = validate_email('test@gmail.com') - del validate_email_deliverability.TEST_CHECK_TIMEOUT \ No newline at end of file + validate_email('test@gmail.com') + del validate_email_deliverability.TEST_CHECK_TIMEOUT From 0954661ecdffc440768f5c49f1b07b77f0ff7b2b Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 19 May 2020 07:15:08 -0400 Subject: [PATCH 005/174] Version 1.1.1 --- README.md | 6 +++--- setup.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index fe773b0..3d9f264 100644 --- a/README.md +++ b/README.md @@ -26,8 +26,8 @@ And this library does NOT permit obsolete forms of email addresses, so if you need strict validation against the email specs exactly, use 
[pyIsEmail](https://github.com/michaelherold/pyIsEmail). -This library was first published in 2015. The current version is 1.1.0 -(posted April 30, 2020). **In this version, the type of the value returned +This library was first published in 2015. The current version is 1.1.1 +(posted May 19, 2020). **Starting in version 1.1.0, the type of the value returned from `validate_email` has changed, but dict-style access to the validated address information still works, so it is backwards compatible.** @@ -408,4 +408,4 @@ git push --tags ``` Notes: The wheel is specified as universal in the file `setup.cfg` by the `universal = 1` key in the -`[bdist_wheel]` section. \ No newline at end of file +`[bdist_wheel]` section. diff --git a/setup.py b/setup.py index 6cff8d4..0edfc44 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='email_validator', - version='1.1.0', + version='1.1.1', description='A robust email syntax and deliverability validation library for Python 2.x/3.x.', long_description=open("README.md", encoding='utf-8').read(), From 9b122ad54e0179badec34579310c71bf83035fc0 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Wed, 20 May 2020 08:58:53 -0400 Subject: [PATCH 006/174] Mention Punycode normalization, re-do fields as a table --- README.md | 73 +++++++++++++++---------------------------------------- 1 file changed, 19 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index 3d9f264..930b201 100644 --- a/README.md +++ b/README.md @@ -230,13 +230,12 @@ address (domain names are case-insensitive), [Unicode "NFC" normalization](https://en.wikipedia.org/wiki/Unicode_equivalence) of the whole address (which turns characters plus [combining characters](https://en.wikipedia.org/wiki/Combining_character) into -precomposed characters where possible and replaces certain Unicode -characters (such as angstrom and ohm) with other equivalent code points -(a-with-ring and omega, respectively)), replacement of [fullwidth and +precomposed 
characters where possible, replacement of [fullwidth and halfwidth characters](https://en.wikipedia.org/wiki/Halfwidth_and_fullwidth_forms) -in the domain part, and possibly other -[UTS46](http://unicode.org/reports/tr46) mappings on the domain part. +in the domain part, possibly other +[UTS46](http://unicode.org/reports/tr46) mappings on the domain part, +and conversion from Punycode to Unicode characters. (See [RFC 6532 (internationalized email) section 3.1](https://tools.ietf.org/html/rfc6532#section-3.1) and [RFC 5895 @@ -283,6 +282,10 @@ converted to IDNA ASCII Punycode). Also note that the `email` and `domain` fields provide a normalized form of the email address and domain name (casefolding and Unicode normalization as required by IDNA 2008). +Calling `validate_email` with the ASCII form of the above email address, +`example@xn--bdk.life`, returns the exact same information (i.e., the +`email` field always will contain Unicode characters, not Punycode). + For the fictitious address `ツ-test@joshdata.me`, which has an internationalized local part, the returned object is: @@ -309,55 +312,17 @@ Return value When an email address passes validation, the fields in the returned object are: -`email`: The canonical form of the email address, mostly useful for - display purposes. This merely combines the `local_part` and `domain` - fields (see below). - -`ascii_email`: If set, an ASCII-only form of the email address by replacing the - domain part with [IDNA](https://tools.ietf.org/html/rfc5891) - [Punycode](https://www.rfc-editor.org/rfc/rfc3492.txt). - This field will be present when an ASCII-only form of the email - address exists (including if the email address is already ASCII). If - the local part of the email address contains internationalized - characters, `ascii_email` will be `None`. If set, it merely combines - `ascii_local_part` and `ascii_domain`. 
- -`local_part`: The local part of the given email address (before the @-sign) with - Unicode NFC normalization applied. - -`ascii_local_part`: If set, the local part, which is composed of ASCII characters only. - -`domain`: The canonical internationalized Unicode form of the domain part of the - email address. If the returned string contains non-ASCII characters, either the - [SMTPUTF8](https://tools.ietf.org/html/rfc6531) feature of your - mail relay will be required to transmit the message or else the - email address's domain part must be converted to IDNA ASCII first: Use - `ascii_domain` field instead. - -`ascii_domain`: The [IDNA](https://tools.ietf.org/html/rfc5891) - [Punycode](https://www.rfc-editor.org/rfc/rfc3492.txt)-encoded - form of the domain part of the given email address, as - it would be transmitted on the wire. - -`smtputf8`: A boolean indicating that the - [SMTPUTF8](https://tools.ietf.org/html/rfc6531) feature of your - mail relay will be required to transmit messages to this address - because the local part of the address has non-ASCII characters (the - local part cannot be IDNA-encoded). If `allow_smtputf8=False` is - passed as an argument, this flag will always be false because an - exception is raised if it would have been true. - -`mx`: A list of (priority, domain) tuples of MX records specified in the - DNS for the domain (see [RFC 5321 section - 5](https://tools.ietf.org/html/rfc5321#section-5)). May be `None` if - the deliverability check could not be completed because of a temporary - issue like a timeout. - -`mx_fallback_type`: `None` if an `MX` record is found. If no MX records are actually - specified in DNS and instead are inferred, through an obsolete - mechanism, from A or AAAA records, the value is the type of DNS - record used instead (`A` or `AAAA`). May be `None` if the deliverability check - could not be completed because of a temporary issue like a timeout. 
+| Field | Value | +| -----:|-------| +| `email` | The normalized form of the email address that you should put in your database. This merely combines the `local_part` and `domain` fields (see below). | +| `ascii_email` | If set, an ASCII-only form of the email address by replacing the domain part with [IDNA](https://tools.ietf.org/html/rfc5891) [Punycode](https://www.rfc-editor.org/rfc/rfc3492.txt). This field will be present when an ASCII-only form of the email address exists (including if the email address is already ASCII). If the local part of the email address contains internationalized characters, `ascii_email` will be `None`. If set, it merely combines `ascii_local_part` and `ascii_domain`. | +| `local_part` | The local part of the given email address (before the @-sign) with Unicode NFC normalization applied. | +| `ascii_local_part` | If set, the local part, which is composed of ASCII characters only. | +| `domain` | The canonical internationalized Unicode form of the domain part of the email address. If the returned string contains non-ASCII characters, either the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) feature of your mail relay will be required to transmit the message or else the email address's domain part must be converted to IDNA ASCII first: Use `ascii_domain` field instead. | +| `ascii_domain` | The [IDNA](https://tools.ietf.org/html/rfc5891) [Punycode](https://www.rfc-editor.org/rfc/rfc3492.txt)-encoded form of the domain part of the given email address, as it would be transmitted on the wire. | +| `smtputf8` | A boolean indicating that the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) feature of your mail relay will be required to transmit messages to this address because the local part of the address has non-ASCII characters (the local part cannot be IDNA-encoded). If `allow_smtputf8=False` is passed as an argument, this flag will always be false because an exception is raised if it would have been true. 
| +| `mx` | A list of (priority, domain) tuples of MX records specified in the DNS for the domain (see [RFC 5321 section 5](https://tools.ietf.org/html/rfc5321#section-5)). May be `None` if the deliverability check could not be completed because of a temporary issue like a timeout. | +| `mx_fallback_type` | `None` if an `MX` record is found. If no MX records are actually specified in DNS and instead are inferred, through an obsolete mechanism, from A or AAAA records, the value is the type of DNS record used instead (`A` or `AAAA`). May be `None` if the deliverability check could not be completed because of a temporary issue like a timeout. | Assumptions ----------- From cc9b0019fc9d4cedb491fd760e092572858d4604 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Thu, 16 Jul 2020 17:16:06 -0400 Subject: [PATCH 007/174] Package name should have a dash not an underscore --- README.md | 15 ++++++++------- setup.py | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 930b201..2ef35de 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,18 @@ -email\_validator -================ +email-validator: Validate Email Addresses +========================================= A robust email address syntax and deliverability validation library for -Python 2.7/3.4+ by [Joshua Tauberer](https://razor.occams.info). +Python 2.7/3.4+ by [Joshua Tauberer](https://joshdata.me). -This library validates that a string is of the form `x@y.com`. This is +This library validates that a string is of the form `name@example.com`. This is the sort of validation you would want for an email-based login form on a website. Key features: -* Good for validating email addresses used for logins/identity. -* Friendly error messages when validation fails (appropriate to show +* Checks that an email address has the correct syntax --- good for + login forms or other uses related to identifying users. 
+* Gives friendly error messages when validation fails (appropriate to show to end users). * (optionally) Checks deliverability: Does the domain name resolve? * Supports internationalized domain names and (optionally) @@ -37,7 +38,7 @@ Installation This package [is on PyPI](https://pypi.org/project/email-validator/), so: ```sh -pip install email_validator +pip install email-validator ``` `pip3` also works. diff --git a/setup.py b/setup.py index 0edfc44..71006e4 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ from codecs import open setup( - name='email_validator', + name='email-validator', version='1.1.1', description='A robust email syntax and deliverability validation library for Python 2.x/3.x.', From 458c9c49422eeac92903e4e7d2865b1256b361d6 Mon Sep 17 00:00:00 2001 From: dade <0xdade@users.noreply.github.com> Date: Mon, 3 Aug 2020 03:52:11 -0700 Subject: [PATCH 008/174] Use dnspython's resolve method when available (#46) Closes #45 by defaulting to resolver.resolve and only falling back to resolver.query when resolver.resolve isn't present. Co-authored-by: Joshua Tauberer --- email_validator/__init__.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/email_validator/__init__.py b/email_validator/__init__.py index 83761cd..3b7014f 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -438,8 +438,20 @@ def validate_email_deliverability(domain, domain_i18n, timeout=DEFAULT_TIMEOUT): # Check that the domain resolves to an MX record. If there is no MX record, # try an A or AAAA record which is a deprecated fallback for deliverability. - # Add a trailing period to ensure the domain name is treated as fully qualified. - domain += '.' + def dns_resolver_resolve_shim(resolver, domain, record): + try: + # dns.resolver.Resolver.resolve is new to dnspython 2.x. 
+ # https://dnspython.readthedocs.io/en/latest/resolver-class.html#dns.resolver.Resolver.resolve + return resolver.resolve(domain, record) + except AttributeError: + # dnspython 2.x is only available in Python 3.6 and later. For earlier versions + # of Python, we maintain compatibility with dnspython 1.x which has a + # dnspython.resolver.Resolver.query method instead. The only difference is that + # query may treat the domain as relative and use the system's search domains, + # which we prevent by adding a "." to the domain name to make it absolute. + # dns.resolver.Resolver.query is deprecated in dnspython version 2.x. + # https://dnspython.readthedocs.io/en/latest/resolver-class.html#dns.resolver.Resolver.query + return resolver.query(domain + ".", record) try: # We need a way to check how timeouts are handled in the tests. So we @@ -455,21 +467,21 @@ def validate_email_deliverability(domain, domain_i18n, timeout=DEFAULT_TIMEOUT): try: # Try resolving for MX records and get them in sorted priority order. - response = dns.resolver.query(domain, "MX") + response = dns_resolver_resolve_shim(resolver, domain, "MX") mtas = sorted([(r.preference, str(r.exchange).rstrip('.')) for r in response]) mx_fallback = None except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN, dns.resolver.NoAnswer): # If there was no MX record, fall back to an A record. try: - response = dns.resolver.query(domain, "A") + response = dns_resolver_resolve_shim(resolver, domain, "A") mtas = [(0, str(r)) for r in response] mx_fallback = "A" except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN, dns.resolver.NoAnswer): # If there was no A record, fall back to an AAAA record. 
try: - response = dns.resolver.query(domain, "AAAA") + response = dns_resolver_resolve_shim(resolver, domain, "AAAA") mtas = [(0, str(r)) for r in response] mx_fallback = "AAAA" except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN, dns.resolver.NoAnswer): From 48840d1b95823baa959a45ff1c0f8586bb234833 Mon Sep 17 00:00:00 2001 From: dade <0xdade@users.noreply.github.com> Date: Fri, 7 Aug 2020 14:22:37 -0700 Subject: [PATCH 009/174] Fix: ValidatedEmail is not JSON serializable (#49) Closes #47 by implementing a simple as_dict() method for ValidatedEmail that can be called when a dict serializable object is desired --- email_validator/__init__.py | 6 +++++- tests/test_main.py | 7 +++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/email_validator/__init__.py b/email_validator/__init__.py index 3b7014f..40140ca 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -156,6 +156,10 @@ def as_constructor(self): ) \ + ")" + """Convenience method for accessing ValidatedEmail as a dict""" + def as_dict(self): + return self.__dict__ + def validate_email( email, @@ -536,7 +540,7 @@ def main(): email = email.decode("utf8") # assume utf8 in input try: result = validate_email(email, allow_smtputf8=allow_smtputf8, check_deliverability=check_deliverability) - print(json.dumps(result, indent=2, sort_keys=True, ensure_ascii=False)) + print(json.dumps(result.as_dict(), indent=2, sort_keys=True, ensure_ascii=False)) except EmailNotValidError as e: if sys.version_info < (3,): print(unicode_class(e).encode("utf8")) diff --git a/tests/test_main.py b/tests/test_main.py index 7d5fe4b..d0c627a 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -250,6 +250,13 @@ def test_email_invalid(email_input, error_msg): assert str(exc_info.value) == error_msg +def test_dict_accessor(): + input_email = "testaddr@example.com" + valid_email = validate_email(input_email, check_deliverability=False) + assert isinstance(valid_email.as_dict(), dict) + 
assert valid_email.as_dict()["original_email"] == input_email + + def test_deliverability_no_records(): assert validate_email_deliverability('example.com', 'example.com') == {'mx': [(0, '')], 'mx-fallback': None} From 7428eeb8f0ef0cc57f9de3ccab0ee9c792153fd9 Mon Sep 17 00:00:00 2001 From: dade <0xdade@users.noreply.github.com> Date: Fri, 7 Aug 2020 14:30:35 -0700 Subject: [PATCH 010/174] Dedupe length reason logic and declare magic numbers as constants (#50) --- email_validator/__init__.py | 59 +++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/email_validator/__init__.py b/email_validator/__init__.py index 40140ca..4647d87 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -29,6 +29,13 @@ # the beginning or end of a *dot-atom component* of a hostname either. ATEXT_HOSTNAME = r'(?:(?:[a-zA-Z0-9][a-zA-Z0-9\-]*)?[a-zA-Z0-9])' +# Length constants +# RFC 3696 + errata 1003 + errata 1690 (https://www.rfc-editor.org/errata_search.php?rfc=3696&eid=1690) +# explains the maximum length of an email address is 254 octets. +EMAIL_MAX_LENGTH = 254 +LOCAL_PART_MAX_LENGTH = 64 +DOMAIN_MAX_LENGTH = 255 + # ease compatibility in type checking if sys.version_info >= (3,): unicode_class = str @@ -161,6 +168,14 @@ def as_dict(self): return self.__dict__ +def __get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH): + diff = len(addr) - limit + reason = "({}{} character{} too many)" + prefix = "at least " if utf8 else "" + suffix = "s" if diff > 1 else "" + return reason.format(prefix, diff, suffix) + + def validate_email( email, allow_smtputf8=True, @@ -212,9 +227,6 @@ def validate_email( if not ret.smtputf8: ret.ascii_email = ret.ascii_local_part + "@" + ret.ascii_domain - # RFC 3696 + errata 1003 + errata 1690 (https://www.rfc-editor.org/errata_search.php?rfc=3696&eid=1690) - # explains the maximum length of an email address is 254 octets. 
- # # If the email address has an ASCII representation, then we assume it may be # transmitted in ASCII (we can't assume SMTPUTF8 will be used on all hops to # the destination) and the length limit applies to ASCII characters (which is @@ -235,33 +247,24 @@ def validate_email( # longer than the number of characters. # # See the length checks on the local part and the domain. - if ret.ascii_email and len(ret.ascii_email) > 254: + if ret.ascii_email and len(ret.ascii_email) > EMAIL_MAX_LENGTH: if ret.ascii_email == ret.email: - reason = " ({} character{} too many)".format( - len(ret.ascii_email) - 254, - "s" if (len(ret.ascii_email) - 254 != 1) else "" - ) - elif len(ret.email) > 254: + reason = __get_length_reason(ret.ascii_email) + elif len(ret.email) > EMAIL_MAX_LENGTH: # If there are more than 254 characters, then the ASCII # form is definitely going to be too long. - reason = " (at least {} character{} too many)".format( - len(ret.email) - 254, - "s" if (len(ret.email) - 254 != 1) else "" - ) + reason = __get_length_reason(ret.email, utf8=True) else: - reason = " (when converted to IDNA ASCII)" - raise EmailSyntaxError("The email address is too long{}.".format(reason)) - if len(ret.email.encode("utf8")) > 254: - if len(ret.email) > 254: + reason = "(when converted to IDNA ASCII)" + raise EmailSyntaxError("The email address is too long {}.".format(reason)) + if len(ret.email.encode("utf8")) > EMAIL_MAX_LENGTH: + if len(ret.email) > EMAIL_MAX_LENGTH: # If there are more than 254 characters, then the UTF-8 # encoding is definitely going to be too long. 
- reason = " (at least {} character{} too many)".format( - len(ret.email) - 254, - "s" if (len(ret.email) - 254 != 1) else "" - ) + reason = __get_length_reason(ret.email, utf8=True) else: - reason = " (when encoded in bytes)" - raise EmailSyntaxError("The email address is too long{}.".format(reason)) + reason = "(when encoded in bytes)" + raise EmailSyntaxError("The email address is too long {}.".format(reason)) if check_deliverability: # Validate the email address's deliverability and update the @@ -295,11 +298,9 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals # internationalized, then the UTF-8 encoding may be longer, but # that may not be relevant. We will check the total address length # instead. - if len(local) > 64: - raise EmailSyntaxError("The email address is too long before the @-sign ({} character{} too many).".format( - len(local) - 64, - "s" if (len(local) - 64 != 1) else "" - )) + if len(local) > LOCAL_PART_MAX_LENGTH: + reason = __get_length_reason(local, limit=LOCAL_PART_MAX_LENGTH) + raise EmailSyntaxError("The email address is too long before the @-sign {}.".format(reason)) # Check the local part against the regular expression for the older ASCII requirements. m = re.match(DOT_ATOM_TEXT + "\\Z", local) @@ -404,7 +405,7 @@ def validate_email_domain_part(domain): # on the assumption that the domain may be transmitted without SMTPUTF8 # as IDNA ASCII. This is also checked by idna.encode, so this exception # is never reached. 
- if len(ascii_domain) > 255: + if len(ascii_domain) > DOMAIN_MAX_LENGTH: raise EmailSyntaxError("The email address is too long after the @-sign.") # A "dot atom text", per RFC 2822 3.2.4, but using the restricted From 893aae5735eecf37a616f3ebed8144c2928e655a Mon Sep 17 00:00:00 2001 From: dade <0xdade@users.noreply.github.com> Date: Fri, 7 Aug 2020 14:31:08 -0700 Subject: [PATCH 011/174] Simplify email equality check into return statement (#51) --- Makefile | 2 +- email_validator/__init__.py | 20 ++++++++++++-------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index ac92c99..71f8600 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ install: .PHONY: lint lint: #python setup.py check -rms - flake8 --ignore=E501,E126 email_validator tests + flake8 --ignore=E501,E126,W503 email_validator tests .PHONY: test test: diff --git a/email_validator/__init__.py b/email_validator/__init__.py index 4647d87..4d720ae 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -142,14 +142,18 @@ def __getitem__(self, key): """Tests use this.""" def __eq__(self, other): - if self.email == other.email and self.local_part == other.local_part and self.domain == other.domain \ - and self.ascii_email == other.ascii_email and self.ascii_local_part == other.ascii_local_part \ - and self.ascii_domain == other.ascii_domain \ - and self.smtputf8 == other.smtputf8 \ - and repr(sorted(self.mx) if self.mx else self.mx) == repr(sorted(other.mx) if other.mx else other.mx) \ - and self.mx_fallback_type == other.mx_fallback_type: - return True - return False + return ( + self.email == other.email + and self.local_part == other.local_part + and self.domain == other.domain + and self.ascii_email == other.ascii_email + and self.ascii_local_part == other.ascii_local_part + and self.ascii_domain == other.ascii_domain + and self.smtputf8 == other.smtputf8 + and repr(sorted(self.mx) if self.mx else self.mx) + == repr(sorted(other.mx) if other.mx 
else other.mx) + and self.mx_fallback_type == other.mx_fallback_type + ) """This helps producing the README.""" def as_constructor(self): From fd76e666e12013c34c35731d0bef006ed0455e34 Mon Sep 17 00:00:00 2001 From: dade <0xdade@users.noreply.github.com> Date: Tue, 25 Aug 2020 04:17:58 -0700 Subject: [PATCH 012/174] Refactor: Main refactored, tests added for main (#52) --- email_validator/__init__.py | 33 ++++++++++---------- tests/test_main.py | 60 +++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 17 deletions(-) diff --git a/email_validator/__init__.py b/email_validator/__init__.py index 4d720ae..ded7899 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -525,32 +525,31 @@ def main(): import sys import json + def __utf8_input_shim(input_str): + if sys.version_info < (3,): + return input_str.decode("utf-8") + return input_str + + def __utf8_output_shim(output_str): + if sys.version_info < (3,): + return unicode_class(output_str).encode("utf-8") + return output_str + if len(sys.argv) == 1: - # Read lines for STDIN and validate the email address on each line. - allow_smtputf8 = True for line in sys.stdin: + email = __utf8_input_shim(line.strip()) try: - email = line.strip() - if sys.version_info < (3,): - email = email.decode("utf8") # assume utf8 in input - validate_email(email, allow_smtputf8=allow_smtputf8) + validate_email(email) except EmailNotValidError as e: - print(email, e) + print(__utf8_output_shim("{} {}".format(email, e))) else: # Validate the email address passed on the command line. 
- email = sys.argv[1] - allow_smtputf8 = True - check_deliverability = True - if sys.version_info < (3,): - email = email.decode("utf8") # assume utf8 in input + email = __utf8_input_shim(sys.argv[1]) try: - result = validate_email(email, allow_smtputf8=allow_smtputf8, check_deliverability=check_deliverability) + result = validate_email(email) print(json.dumps(result.as_dict(), indent=2, sort_keys=True, ensure_ascii=False)) except EmailNotValidError as e: - if sys.version_info < (3,): - print(unicode_class(e).encode("utf8")) - else: - print(e) + print(__utf8_output_shim(e)) if __name__ == "__main__": diff --git a/tests/test_main.py b/tests/test_main.py index d0c627a..af975ba 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -2,6 +2,8 @@ from email_validator import EmailSyntaxError, EmailUndeliverableError, \ validate_email, validate_email_deliverability, \ ValidatedEmail +# Let's test main but rename it to be clear +from email_validator import main as validator_main @pytest.mark.parametrize( @@ -284,3 +286,61 @@ def test_deliverability_dns_timeout(): assert response.get("unknown-deliverability") == "timeout" validate_email('test@gmail.com') del validate_email_deliverability.TEST_CHECK_TIMEOUT + + +def test_main_single_good_input(monkeypatch, capsys): + import json + test_email = "test@example.com" + monkeypatch.setattr('sys.argv', ['email_validator', test_email]) + validator_main() + stdout, _ = capsys.readouterr() + output = json.loads(str(stdout)) + assert isinstance(output, dict) + assert validate_email(test_email).original_email == output["original_email"] + + +def test_main_single_bad_input(monkeypatch, capsys): + bad_email = 'test@..com' + monkeypatch.setattr('sys.argv', ['email_validator', bad_email]) + validator_main() + stdout, _ = capsys.readouterr() + assert stdout == 'An email address cannot have a period immediately after the @-sign.\n' + + +def test_main_multi_input(monkeypatch, capsys): + import io + test_cases = ["test@example.com", 
"test2@example.com", "test@.com", "test3@.com"] + test_input = io.StringIO("\n".join(test_cases)) + monkeypatch.setattr('sys.stdin', test_input) + monkeypatch.setattr('sys.argv', ['email_validator']) + validator_main() + stdout, _ = capsys.readouterr() + assert test_cases[0] not in stdout + assert test_cases[1] not in stdout + assert test_cases[2] in stdout + assert test_cases[3] in stdout + + +def test_main_input_shim(monkeypatch, capsys): + import json + monkeypatch.setattr('sys.version_info', (2, 7)) + test_email = b"test@example.com" + monkeypatch.setattr('sys.argv', ['email_validator', test_email]) + validator_main() + stdout, _ = capsys.readouterr() + output = json.loads(str(stdout)) + assert isinstance(output, dict) + assert validate_email(test_email).original_email == output["original_email"] + + +def test_main_output_shim(monkeypatch, capsys): + monkeypatch.setattr('sys.version_info', (2, 7)) + test_email = b"test@.com" + monkeypatch.setattr('sys.argv', ['email_validator', test_email]) + validator_main() + stdout, _ = capsys.readouterr() + + # This looks bad but it has to do with the way python 2.7 prints vs py3 + # The \n is part of the print statement, not part of the string, which is what the b'...' 
is + # Since we're mocking py 2.7 here instead of actually using 2.7, this was the closest I could get + assert stdout == "b'An email address cannot have a period immediately after the @-sign.'\n" From df5dbf9f83de4dea72404fda5f68049ca20fce98 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Thu, 5 Nov 2020 16:44:00 -0500 Subject: [PATCH 013/174] Version 1.1.2 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 71006e4..28dae7f 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='email-validator', - version='1.1.1', + version='1.1.2', description='A robust email syntax and deliverability validation library for Python 2.x/3.x.', long_description=open("README.md", encoding='utf-8').read(), From ad53fb4166a10a09b76b4a9861a305017a98f5fc Mon Sep 17 00:00:00 2001 From: Jonas Kittner <54631600+theendlessriver13@users.noreply.github.com> Date: Thu, 12 Nov 2020 00:56:29 +0100 Subject: [PATCH 014/174] add py39 and setup_py to setup_cfg (#57) --- .travis.yml | 1 + setup.cfg | 40 +++++++++++++++++++++++++++++++++++++--- setup.py | 51 ++------------------------------------------------- 3 files changed, 40 insertions(+), 52 deletions(-) diff --git a/.travis.yml b/.travis.yml index 74b263e..100a5ed 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,6 +10,7 @@ python: - '3.6' - '3.7' - '3.8' +- '3.9' install: - make install diff --git a/setup.cfg b/setup.cfg index f6ab14a..fd4cfdc 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,8 +1,42 @@ -[bdist_wheel] -universal = 1 - [metadata] +name = email_validator +version = 1.1.2 +description = A robust email syntax and deliverability validation library for Python 2.x/3.x. 
+long_description = file: README.md +long_description_content_type = text/markdown +url = https://github.com/JoshData/python-email-validator +author = Joshua Tauberer +author_email = jt@occams.info +license = CC0 (copyright waived) license_file = LICENSE +classifiers = + Development Status :: 5 - Production/Stable + Intended Audience :: Developers + License :: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication + Programming Language :: Python :: 2 + Programming Language :: Python :: 2.7 + Programming Language :: Python :: 3 + Programming Language :: Python :: 3.5 + Programming Language :: Python :: 3.6 + Programming Language :: Python :: 3.7 + Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 + Topic :: Software Development :: Libraries :: Python Modules +keywords = email address validator + +[options] +packages = find: +install_requires = + dnspython>=1.15.0 + idna>=2.0.0 +python_requires = >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.* + +[options.entry_points] +console_scripts = + email_validator=email_validator:main + +[bdist_wheel] +universal = 1 [flake8] max-line-length = 120 diff --git a/setup.py b/setup.py index 28dae7f..8bf1ba9 100644 --- a/setup.py +++ b/setup.py @@ -1,49 +1,2 @@ -# -*- coding: utf-8 -*- - -from setuptools import setup, find_packages -from codecs import open - -setup( - name='email-validator', - version='1.1.2', - - description='A robust email syntax and deliverability validation library for Python 2.x/3.x.', - long_description=open("README.md", encoding='utf-8').read(), - long_description_content_type="text/markdown", - url='https://github.com/JoshData/python-email-validator', - - author=u'Joshua Tauberer', - author_email=u'jt@occams.info', - license='CC0 (copyright waived)', - - # See https://pypi.org/pypi?%3Aaction=list_classifiers - classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'License :: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication', - - 'Intended Audience :: Developers', 
- 'Topic :: Software Development :: Libraries :: Python Modules', - - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - ], - - keywords="email address validator", - - packages=find_packages(), - install_requires=[ - "idna>=2.0.0", - "dnspython>=1.15.0"], - - entry_points={ - 'console_scripts': [ - 'email_validator=email_validator:main', - ], - }, -) +from setuptools import setup +setup() From 1431de05ec70494b1d0ed4d1d3cdb6662fe755df Mon Sep 17 00:00:00 2001 From: Hugo Heyman Date: Sun, 15 Nov 2020 22:58:27 +0100 Subject: [PATCH 015/174] Add possibility to cache dns lookups (#58) Add optional argument dns_resolver to validate_email. If provided it will be used instead of the default resolver. The provided resolver can have a configured cache and custom timeout. Co-authored-by: Joshua Tauberer --- README.md | 29 +++++++++++++++++----------- email_validator/__init__.py | 38 ++++++++++++++++++++++++------------- tests/test_main.py | 28 ++++++++++++++++++++++++++- 3 files changed, 70 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 2ef35de..929b525 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Key features: login forms or other uses related to identifying users. * Gives friendly error messages when validation fails (appropriate to show to end users). -* (optionally) Checks deliverability: Does the domain name resolve? +* (optionally) Checks deliverability: Does the domain name resolve? And you can override the default DNS resolver. * Supports internationalized domain names and (optionally) internationalized local parts. 
* Normalizes email addresses (super important for internationalized @@ -69,23 +69,27 @@ This validates the address and gives you its normalized form. You should put the normalized form in your database and always normalize before checking if an address is in your database. -The validator will accept internationalized email addresses, but email -addresses with non-ASCII characters in the *local* part of the address -(before the @-sign) require the -[SMTPUTF8](https://tools.ietf.org/html/rfc6531) extension which may not -be supported by your mail submission library or your outbound mail -server. If you know ahead of time that SMTPUTF8 is not supported then -**add the keyword argument allow\_smtputf8=False to fail validation for -addresses that would require SMTPUTF8**: +When validating many email addresses or to control the timeout (the default is 15 seconds), create a caching [dns.resolver.Resolver](https://dnspython.readthedocs.io/en/latest/resolver-class.html) to reuse in each call: ```python -valid = validate_email(email, allow_smtputf8=False) +from email_validator import validate_email, caching_resolver + +resolver = caching_resolver(timeout=10) + +while True: + valid = validate_email(email, dns_resolver=resolver) ``` +The validator will accept internationalized email addresses, but not all +mail systems can send email to an addresses with non-ASCII characters in +the *local* part of the address (before the @-sign). See the `allow_smtputf8` +option below. + + Overview -------- -The module provides a single function `validate_email(email_address)` which +The module provides a function `validate_email(email_address)` which takes an email address (either a `str` or ASCII `bytes`) and: - Raises a `EmailNotValidError` with a helpful, human-readable error @@ -128,6 +132,9 @@ shown): `allow_empty_local=False`: Set to `True` to allow an empty local part (i.e. `@example.com`), e.g. for validating Postfix aliases. 
+ +`dns_resolver=None`: Pass an instance of [dns.resolver.Resolver](https://dnspython.readthedocs.io/en/latest/resolver-class.html) to control the DNS resolver including setting a timeout and [a cache](https://dnspython.readthedocs.io/en/latest/resolver-caching.html). The `caching_resolver` function shown above is a helper function to construct a dns.resolver.Resolver with a [LRUCache](https://dnspython.readthedocs.io/en/latest/resolver-caching.html#dns.resolver.LRUCache). Reuse the same resolver instance across calls to `validate_email` to make use of the cache. + Internationalized email addresses --------------------------------- diff --git a/email_validator/__init__.py b/email_validator/__init__.py index ded7899..f960f67 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -180,12 +180,20 @@ def __get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH): return reason.format(prefix, diff, suffix) +def caching_resolver(timeout=DEFAULT_TIMEOUT, cache=None): + resolver = dns.resolver.Resolver() + resolver.cache = cache or dns.resolver.LRUCache() + resolver.lifetime = timeout # timeout, in seconds + return resolver + + def validate_email( email, allow_smtputf8=True, allow_empty_local=False, check_deliverability=True, timeout=DEFAULT_TIMEOUT, + dns_resolver=None ): """ Validates an email address, raising an EmailNotValidError if the address is not valid or returning a dict of @@ -273,7 +281,9 @@ def validate_email( if check_deliverability: # Validate the email address's deliverability and update the # return dict with metadata. 
- deliverability_info = validate_email_deliverability(ret["domain"], ret["domain_i18n"], timeout) + deliverability_info = validate_email_deliverability( + ret["domain"], ret["domain_i18n"], timeout, dns_resolver + ) if "mx" in deliverability_info: ret.mx = deliverability_info["mx"] ret.mx_fallback_type = deliverability_info["mx-fallback"] @@ -443,15 +453,22 @@ def validate_email_domain_part(domain): } -def validate_email_deliverability(domain, domain_i18n, timeout=DEFAULT_TIMEOUT): +def validate_email_deliverability(domain, domain_i18n, timeout=DEFAULT_TIMEOUT, dns_resolver=None): # Check that the domain resolves to an MX record. If there is no MX record, # try an A or AAAA record which is a deprecated fallback for deliverability. - def dns_resolver_resolve_shim(resolver, domain, record): + # If no dns.resolver.Resolver was given, get dnspython's default resolver. + # Override the default resolver's timeout. This may affect other uses of + # dnspython in this process. + if dns_resolver is None: + dns_resolver = dns.resolver.get_default_resolver() + dns_resolver.lifetime = timeout + + def dns_resolver_resolve_shim(domain, record): try: # dns.resolver.Resolver.resolve is new to dnspython 2.x. # https://dnspython.readthedocs.io/en/latest/resolver-class.html#dns.resolver.Resolver.resolve - return resolver.resolve(domain, record) + return dns_resolver.resolve(domain, record) except AttributeError: # dnspython 2.x is only available in Python 3.6 and later. For earlier versions # of Python, we maintain compatibility with dnspython 1.x which has a @@ -460,7 +477,7 @@ def dns_resolver_resolve_shim(resolver, domain, record): # which we prevent by adding a "." to the domain name to make it absolute. # dns.resolver.Resolver.query is deprecated in dnspython version 2.x. 
# https://dnspython.readthedocs.io/en/latest/resolver-class.html#dns.resolver.Resolver.query - return resolver.query(domain + ".", record) + return dns_resolver.query(domain + ".", record) try: # We need a way to check how timeouts are handled in the tests. So we @@ -469,28 +486,23 @@ def dns_resolver_resolve_shim(resolver, domain, record): if getattr(validate_email_deliverability, 'TEST_CHECK_TIMEOUT', False): raise dns.exception.Timeout() - resolver = dns.resolver.get_default_resolver() - - if timeout: - resolver.lifetime = timeout - try: # Try resolving for MX records and get them in sorted priority order. - response = dns_resolver_resolve_shim(resolver, domain, "MX") + response = dns_resolver_resolve_shim(domain, "MX") mtas = sorted([(r.preference, str(r.exchange).rstrip('.')) for r in response]) mx_fallback = None except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN, dns.resolver.NoAnswer): # If there was no MX record, fall back to an A record. try: - response = dns_resolver_resolve_shim(resolver, domain, "A") + response = dns_resolver_resolve_shim(domain, "A") mtas = [(0, str(r)) for r in response] mx_fallback = "A" except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN, dns.resolver.NoAnswer): # If there was no A record, fall back to an AAAA record. 
try: - response = dns_resolver_resolve_shim(resolver, domain, "AAAA") + response = dns_resolver_resolve_shim(domain, "AAAA") mtas = [(0, str(r)) for r in response] mx_fallback = "AAAA" except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN, dns.resolver.NoAnswer): diff --git a/tests/test_main.py b/tests/test_main.py index af975ba..d2fd923 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,7 +1,9 @@ +from unittest import mock +import dns.resolver import pytest from email_validator import EmailSyntaxError, EmailUndeliverableError, \ validate_email, validate_email_deliverability, \ - ValidatedEmail + caching_resolver, ValidatedEmail # Let's test main but rename it to be clear from email_validator import main as validator_main @@ -344,3 +346,27 @@ def test_main_output_shim(monkeypatch, capsys): # The \n is part of the print statement, not part of the string, which is what the b'...' is # Since we're mocking py 2.7 here instead of actually using 2.7, this was the closest I could get assert stdout == "b'An email address cannot have a period immediately after the @-sign.'\n" + + +@mock.patch("dns.resolver.LRUCache.put") +def test_validate_email__with_caching_resolver(mocked_put): + dns_resolver = caching_resolver() + validate_email("test@gmail.com", dns_resolver=dns_resolver) + assert mocked_put.called + + with mock.patch("dns.resolver.LRUCache.get") as mocked_get: + validate_email("test@gmail.com", dns_resolver=dns_resolver) + assert mocked_get.called + + +@mock.patch("dns.resolver.LRUCache.put") +def test_validate_email__with_configured_resolver(mocked_put): + dns_resolver = dns.resolver.Resolver() + dns_resolver.lifetime = 10 + dns_resolver.cache = dns.resolver.LRUCache(max_size=1000) + validate_email("test@gmail.com", dns_resolver=dns_resolver) + assert mocked_put.called + + with mock.patch("dns.resolver.LRUCache.get") as mocked_get: + validate_email("test@gmail.com", dns_resolver=dns_resolver) + assert mocked_get.called From 
701ad549ef970b6ddb1263e570777ca9eddf6174 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 12 Jun 2021 07:01:42 -0400 Subject: [PATCH 016/174] 1.1.3 --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index fd4cfdc..43ce496 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = email_validator -version = 1.1.2 +version = 1.1.3 description = A robust email syntax and deliverability validation library for Python 2.x/3.x. long_description = file: README.md long_description_content_type = text/markdown From 1682bda9367bc93d2cee06486330331e35d04a30 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Wed, 1 Sep 2021 14:00:55 -0400 Subject: [PATCH 017/174] Add idna>=2.8 to test_requirements.txt so that exception messages match expected test results, fixes #60 --- test_requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/test_requirements.txt b/test_requirements.txt index e8431c4..0658825 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -3,3 +3,4 @@ docutils==0.15.2 flake8==3.7.9 pytest==5.2.2 pytest-cov==2.8.1 +idna>=2.8 From fbcf145699242784899e236097ba99ecc6dd14d5 Mon Sep 17 00:00:00 2001 From: Yasir Assam Date: Thu, 23 Sep 2021 05:06:18 +1000 Subject: [PATCH 018/174] Fix bug in ValidatedEmail method __eq__() (#66) Check parameter `other` is an instance of ValidatedEmail before checking all its members are equal to self's members. 
--- email_validator/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/email_validator/__init__.py b/email_validator/__init__.py index f960f67..b18ba25 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -142,6 +142,8 @@ def __getitem__(self, key): """Tests use this.""" def __eq__(self, other): + if not isinstance(other, ValidatedEmail): + return False return ( self.email == other.email and self.local_part == other.local_part From 6c53ad67ef3cbc9337a2e6e092b76061190bce08 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Wed, 17 Nov 2021 08:29:13 -0500 Subject: [PATCH 019/174] Raise EmailUndeliverableError for special use domain names and their subdomains, except @test when a new test_environment argument is passed Some of the domain names used in tests had to be revised because they went from valid to invalid, or the exception message changed. --- README.md | 6 +- email_validator/__init__.py | 58 +++++++++++++++--- tests/test_main.py | 113 +++++++++++++++++++++++------------- 3 files changed, 127 insertions(+), 50 deletions(-) diff --git a/README.md b/README.md index 929b525..c9621ce 100644 --- a/README.md +++ b/README.md @@ -135,6 +135,7 @@ shown): `dns_resolver=None`: Pass an instance of [dns.resolver.Resolver](https://dnspython.readthedocs.io/en/latest/resolver-class.html) to control the DNS resolver including setting a timeout and [a cache](https://dnspython.readthedocs.io/en/latest/resolver-caching.html). The `caching_resolver` function shown above is a helper function to construct a dns.resolver.Resolver with a [LRUCache](https://dnspython.readthedocs.io/en/latest/resolver-caching.html#dns.resolver.LRUCache). Reuse the same resolver instance across calls to `validate_email` to make use of the cache. +In non-production test environments, you may want to allow `@test` or `@mycompany.test` email addresses to be used as placeholder email addresses, which would normally not be permitted. 
In that case, pass `test_environment=True`. DNS-based deliverability checks will be disabled as well. Other [Special Use Domain Names](https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml) are always considered invalid and raise `EmailUndeliverableError`. Internationalized email addresses --------------------------------- @@ -340,8 +341,11 @@ strictly conform to the standards. Many email address forms are obsolete or likely to cause trouble: * The validator assumes the email address is intended to be - deliverable on the public Internet using DNS, and so the domain part + deliverable on the public Internet. The domain part of the email address must be a resolvable domain name. + [Special Use Domain Names](https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml) + and their subdomains are always considered invalid (except see + the `test_environment` parameter above). * The "quoted string" form of the local part of the email address (RFC 5321 4.1.2) is not permitted --- no one uses this anymore anyway. Quoted forms allow multiple @-signs, space characters, and other diff --git a/email_validator/__init__.py b/email_validator/__init__.py index b18ba25..cd30c3b 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -36,6 +36,25 @@ LOCAL_PART_MAX_LENGTH = 64 DOMAIN_MAX_LENGTH = 255 +# IANA Special Use Domain Names +# Last Updated 2021-09-21 +# https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.txt +# The domain names without dots would be caught by the check that the domain +# name in an email address must have a period, but this list will also catch +# subdomains of these domains, which are also reserved. 
+SPECIAL_USE_DOMAIN_NAMES = ( + "arpa", # consolidated from a lot of arpa subdomains, we'll assume all subdomains of arpa are actually reserved + "example", + "example.com", + "example.net", + "example.org", + "invalid", + "local", + "localhost", + "onion", + "test", # see special logic for 'test' where this is checked +) + # ease compatibility in type checking if sys.version_info >= (3,): unicode_class = str @@ -194,6 +213,7 @@ def validate_email( allow_smtputf8=True, allow_empty_local=False, check_deliverability=True, + test_environment=False, timeout=DEFAULT_TIMEOUT, dns_resolver=None ): @@ -230,7 +250,7 @@ def validate_email( ret.smtputf8 = local_part_info["smtputf8"] # Validate the email address's domain part syntax and get a normalized form. - domain_part_info = validate_email_domain_part(parts[1]) + domain_part_info = validate_email_domain_part(parts[1], test_environment=test_environment) ret.domain = domain_part_info["domain"] ret.ascii_domain = domain_part_info["ascii_domain"] @@ -280,9 +300,9 @@ def validate_email( reason = "(when encoded in bytes)" raise EmailSyntaxError("The email address is too long {}.".format(reason)) - if check_deliverability: - # Validate the email address's deliverability and update the - # return dict with metadata. + if check_deliverability and not test_environment: + # Validate the email address's deliverability using DNS + # and update the return dict with metadata. deliverability_info = validate_email_deliverability( ret["domain"], ret["domain_i18n"], timeout, dns_resolver ) @@ -356,7 +376,7 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals } -def validate_email_domain_part(domain): +def validate_email_domain_part(domain, test_environment=False): # Empty? 
if len(domain) == 0: raise EmailSyntaxError("There must be something after the @-sign.") @@ -435,11 +455,33 @@ def validate_email_domain_part(domain): raise EmailSyntaxError("The email address contains invalid characters after the @-sign.") # All publicly deliverable addresses have domain named with at least - # one period. We also know that all TLDs end with a letter. - if "." not in ascii_domain: + # one period, and we'll consider the lack of a period a syntax error + # since that will match people's sense of what an email address looks + # like. We'll skip this in test environments to allow '@test' email + # addresses. + if "." not in ascii_domain and not (ascii_domain == "test" and test_environment): raise EmailSyntaxError("The domain name %s is not valid. It should have a period." % domain_i18n) + + # Check special-use and reserved domain names. Raise these as + # deliverability errors since they are syntactically valid. + # Some might fail DNS-based deliverability checks, but that + # can be turned off, so we should fail them all sooner. + for d in SPECIAL_USE_DOMAIN_NAMES: + # RFC 6761 says that applications should not block use of the 'test' + # domain name, presumably because that would prevent it from being + # used for actual testing. We'll block it, except when a special + # testing flag is used, indicating that the module is being used + # in a test environment. + if d == "test" and test_environment: + continue + + if ascii_domain == d or ascii_domain.endswith("." + d): + raise EmailUndeliverableError("The domain name %s is a special-use or reserved name that cannot be used with email." % domain_i18n) + + # We also know that all TLDs currently end with a letter, and + # we'll consider that a non-DNS based deliverability check. if not re.search(r"[A-Za-z]\Z", ascii_domain): - raise EmailSyntaxError( + raise EmailUndeliverableError( "The domain name %s is not valid. It is not within a valid top-level domain." 
% domain_i18n ) diff --git a/tests/test_main.py b/tests/test_main.py index d2fd923..12780bf 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -12,51 +12,51 @@ 'email_input,output', [ ( - 'Abc@example.com', + 'Abc@example.tld', ValidatedEmail( local_part='Abc', ascii_local_part='Abc', smtputf8=False, - ascii_domain='example.com', - domain='example.com', - email='Abc@example.com', - ascii_email='Abc@example.com', + ascii_domain='example.tld', + domain='example.tld', + email='Abc@example.tld', + ascii_email='Abc@example.tld', ), ), ( - 'Abc.123@example.com', + 'Abc.123@test-example.com', ValidatedEmail( local_part='Abc.123', ascii_local_part='Abc.123', smtputf8=False, - ascii_domain='example.com', - domain='example.com', - email='Abc.123@example.com', - ascii_email='Abc.123@example.com', + ascii_domain='test-example.com', + domain='test-example.com', + email='Abc.123@test-example.com', + ascii_email='Abc.123@test-example.com', ), ), ( - 'user+mailbox/department=shipping@example.com', + 'user+mailbox/department=shipping@example.tld', ValidatedEmail( local_part='user+mailbox/department=shipping', ascii_local_part='user+mailbox/department=shipping', smtputf8=False, - ascii_domain='example.com', - domain='example.com', - email='user+mailbox/department=shipping@example.com', - ascii_email='user+mailbox/department=shipping@example.com', + ascii_domain='example.tld', + domain='example.tld', + email='user+mailbox/department=shipping@example.tld', + ascii_email='user+mailbox/department=shipping@example.tld', ), ), ( - "!#$%&'*+-/=?^_`.{|}~@example.com", + "!#$%&'*+-/=?^_`.{|}~@example.tld", ValidatedEmail( local_part="!#$%&'*+-/=?^_`.{|}~", ascii_local_part="!#$%&'*+-/=?^_`.{|}~", smtputf8=False, - ascii_domain='example.com', - domain='example.com', - email="!#$%&'*+-/=?^_`.{|}~@example.com", - ascii_email="!#$%&'*+-/=?^_`.{|}~@example.com", + ascii_domain='example.tld', + domain='example.tld', + email="!#$%&'*+-/=?^_`.{|}~@example.tld", + 
ascii_email="!#$%&'*+-/=?^_`.{|}~@example.tld", ), ), ( @@ -142,43 +142,43 @@ ), ), ( - 'ñoñó@example.com', + 'ñoñó@example.tld', ValidatedEmail( local_part='ñoñó', smtputf8=True, - ascii_domain='example.com', - domain='example.com', - email='ñoñó@example.com', + ascii_domain='example.tld', + domain='example.tld', + email='ñoñó@example.tld', ), ), ( - '我買@example.com', + '我買@example.tld', ValidatedEmail( local_part='我買', smtputf8=True, - ascii_domain='example.com', - domain='example.com', - email='我買@example.com', + ascii_domain='example.tld', + domain='example.tld', + email='我買@example.tld', ), ), ( - '甲斐黒川日本@example.com', + '甲斐黒川日本@example.tld', ValidatedEmail( local_part='甲斐黒川日本', smtputf8=True, - ascii_domain='example.com', - domain='example.com', - email='甲斐黒川日本@example.com', + ascii_domain='example.tld', + domain='example.tld', + email='甲斐黒川日本@example.tld', ), ), ( - 'чебурашкаящик-с-апельсинами.рф@example.com', + 'чебурашкаящик-с-апельсинами.рф@example.tld', ValidatedEmail( local_part='чебурашкаящик-с-апельсинами.рф', smtputf8=True, - ascii_domain='example.com', - domain='example.com', - email='чебурашкаящик-с-апельсинами.рф@example.com', + ascii_domain='example.tld', + domain='example.tld', + email='чебурашкаящик-с-апельсинами.рф@example.tld', ), ), ( @@ -211,6 +211,7 @@ def test_email_valid(email_input, output): @pytest.mark.parametrize( 'email_input,error_msg', [ + ('my@localhost', 'The domain name localhost is not valid. 
It should have a period.'), ('my@.leadingdot.com', 'An email address cannot have a period immediately after the @-sign.'), ('my@..leadingfwdot.com', 'An email address cannot have a period immediately after the @-sign.'), ('my@..twodots.com', 'An email address cannot have a period immediately after the @-sign.'), @@ -247,15 +248,45 @@ def test_email_valid(email_input, output): ('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (at least 1 character too many).'), ], ) -def test_email_invalid(email_input, error_msg): +def test_email_invalid_syntax(email_input, error_msg): + # Since these all have syntax errors, deliverability + # checks do not arise. with pytest.raises(EmailSyntaxError) as exc_info: validate_email(email_input) # print(f'({email_input!r}, {str(exc_info.value)!r}),') assert str(exc_info.value) == error_msg +@pytest.mark.parametrize( + 'email_input', + [ + ('me@anything.arpa'), + ('me@anything.example'), + ('me@example.com'), + ('me@mail.example.com'), + ('me@valid.invalid'), + ('me@link.local'), + ('me@host.localhost'), + ('me@onion.onion.onion'), + ('me@test.test.test'), + ], +) +def test_email_invalid_reserved_domain(email_input): + # Since these all fail deliverabiltiy from a static list, + # DNS deliverability checks do not arise. 
+ with pytest.raises(EmailUndeliverableError) as exc_info: + validate_email(email_input) + # print(f'({email_input!r}, {str(exc_info.value)!r}),') + assert "is a special-use or reserved name" in str(exc_info.value) + + +def test_email_test_domain_name_in_test_environment(): + validate_email("anything@test", test_environment=True) + validate_email("anything@mycompany.test", test_environment=True) + + def test_dict_accessor(): - input_email = "testaddr@example.com" + input_email = "testaddr@example.tld" valid_email = validate_email(input_email, check_deliverability=False) assert isinstance(valid_email.as_dict(), dict) assert valid_email.as_dict()["original_email"] == input_email @@ -292,7 +323,7 @@ def test_deliverability_dns_timeout(): def test_main_single_good_input(monkeypatch, capsys): import json - test_email = "test@example.com" + test_email = "google@google.com" monkeypatch.setattr('sys.argv', ['email_validator', test_email]) validator_main() stdout, _ = capsys.readouterr() @@ -311,7 +342,7 @@ def test_main_single_bad_input(monkeypatch, capsys): def test_main_multi_input(monkeypatch, capsys): import io - test_cases = ["test@example.com", "test2@example.com", "test@.com", "test3@.com"] + test_cases = ["google1@google.com", "google2@google.com", "test@.com", "test3@.com"] test_input = io.StringIO("\n".join(test_cases)) monkeypatch.setattr('sys.stdin', test_input) monkeypatch.setattr('sys.argv', ['email_validator']) @@ -326,7 +357,7 @@ def test_main_multi_input(monkeypatch, capsys): def test_main_input_shim(monkeypatch, capsys): import json monkeypatch.setattr('sys.version_info', (2, 7)) - test_email = b"test@example.com" + test_email = b"google@google.com" monkeypatch.setattr('sys.argv', ['email_validator', test_email]) validator_main() stdout, _ = capsys.readouterr() From be65b53dd0b3b5c04f5e797cac48b55843acb3d3 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 17 Jan 2022 18:08:46 -0500 Subject: [PATCH 020/174] Make taking the normalized form more 
critical to the main example --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index c9621ce..e93506d 100644 --- a/README.md +++ b/README.md @@ -55,11 +55,11 @@ from email_validator import validate_email, EmailNotValidError email = "my+address@mydomain.tld" try: - # Validate. - valid = validate_email(email) - - # Update with the normalized form. - email = valid.email + # Validate & take the normalized form of the email + # address for all logic beyond this point (especially + # before going to a database query where equality + # does not take into account normalization). + email = validate_email(email).email except EmailNotValidError as e: # email is not valid, exception message is human-readable print(str(e)) From b08d0d33854293ca689c93169373ab292a98e923 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Thu, 21 Apr 2022 08:06:59 -0400 Subject: [PATCH 021/174] ASCII=>English in the README --- README.md | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index e93506d..bb8380e 100644 --- a/README.md +++ b/README.md @@ -77,11 +77,11 @@ from email_validator import validate_email, caching_resolver resolver = caching_resolver(timeout=10) while True: - valid = validate_email(email, dns_resolver=resolver) + email = validate_email(email, dns_resolver=resolver).email ``` The validator will accept internationalized email addresses, but not all -mail systems can send email to an addresses with non-ASCII characters in +mail systems can send email to an addresses with non-English characters in the *local* part of the address (before the @-sign). See the `allow_smtputf8` option below. 
@@ -90,12 +90,13 @@ Overview -------- The module provides a function `validate_email(email_address)` which -takes an email address (either a `str` or ASCII `bytes`) and: +takes an email address (either a `str` or `bytes`, but only non-internationalized +addresses are allowed when passing a `bytes`) and: - Raises a `EmailNotValidError` with a helpful, human-readable error message explaining why the email address is not valid, or -- Returns an object with a normalized form of the email address and - other information about it. +- Returns an object with a normalized form of the email address (which + you should use!) and other information about it. When an email address is not valid, `validate_email` raises either an `EmailSyntaxError` if the form of the address is invalid or an @@ -141,7 +142,7 @@ Internationalized email addresses --------------------------------- The email protocol SMTP and the domain name system DNS have historically -only allowed ASCII characters in email addresses and domain names, +only allowed English (ASCII) characters in email addresses and domain names, respectively. Each has adapted to internationalization in a separate way, creating two separate aspects to email address internationalization. @@ -167,11 +168,17 @@ using the [idna](https://github.com/kjd/idna) module by Kim Davies. ### Internationalized local parts The second sort of internationalization is internationalization in the -*local* part of the address (before the @-sign). These email addresses -require that your mail submission library and the mail servers along the -route to the destination, including your own outbound mail server, all -support the [SMTPUTF8 (RFC 6531)](https://tools.ietf.org/html/rfc6531) -extension. Support for SMTPUTF8 varies. +*local* part of the address (before the @-sign). In non-internationalized +email addresses, only English letters, numbers, and some punctuation +(`._!#$%&'^``*+-=~/?{|}`) are allowed. 
In internationalized email address +local parts, all Unicode characters are allowed by this library, although +it's possible that not all characters will be allowed by all mail systems. + +To deliver email to addresses with Unicode, non-English characters, your mail +submission library and the mail servers along the route to the destination, +including your own outbound mail server, must all support the +[SMTPUTF8 (RFC 6531)](https://tools.ietf.org/html/rfc6531) extension. +Support for SMTPUTF8 varies. See the `allow_smtputf8` parameter. ### If you know ahead of time that SMTPUTF8 is not supported by your mail submission stack From 84a24133d5393b3b9bb5f9285d252ad70e40632a Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Thu, 21 Apr 2022 08:18:56 -0400 Subject: [PATCH 022/174] Check and fail domains with null MX records mock.patch("dns.resolver.LRUCache.get") breaks the DNS check in a way that didn't fail the deliverability check before, but it does now, so the mock is replaced with something else. --- README.md | 9 ++++--- email_validator/__init__.py | 11 +++++++- tests/test_main.py | 52 ++++++++++++++++++++++--------------- 3 files changed, 46 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index bb8380e..f5712c8 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ addresses are allowed when passing a `bytes`) and: When an email address is not valid, `validate_email` raises either an `EmailSyntaxError` if the form of the address is invalid or an -`EmailUndeliverableError` if the domain name does not resolve. Both +`EmailUndeliverableError` if the domain name fails the DNS check. Both exception classes are subclasses of `EmailNotValidError`, which in turn is a subclass of `ValueError`. @@ -113,14 +113,15 @@ one uses anymore even though they are still valid and deliverable, since they will probably give you grief if you're using email for login. (See later in the document about that.) 
-The validator checks that the domain name in the email address resolves. +The validator checks that the domain name in the email address has a +(non-null) MX DNS record indicating that it is configured for email. There is nothing to be gained by trying to actually contact an SMTP server, so that's not done here. For privacy, security, and practicality reasons servers are good at not giving away whether an address is deliverable or not: email addresses that appear to accept mail at first can bounce mail after a delay, and bounced mail may indicate a temporary failure of a good email address (sometimes an intentional failure, like -greylisting). +greylisting). (A/AAAA-record fallback is also checked.) The function also accepts the following keyword arguments (default as shown): @@ -129,7 +130,7 @@ shown): require the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) extension. -`check_deliverability=True`: Set to `False` to skip the domain name resolution check. +`check_deliverability=True`: Set to `False` to skip the domain name MX DNS record check. `allow_empty_local=False`: Set to `True` to allow an empty local part (i.e. `@example.com`), e.g. for validating Postfix aliases. diff --git a/email_validator/__init__.py b/email_validator/__init__.py index cd30c3b..40798a6 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -531,10 +531,19 @@ def dns_resolver_resolve_shim(domain, record): raise dns.exception.Timeout() try: - # Try resolving for MX records and get them in sorted priority order. + # Try resolving for MX records and get them in sorted priority order + # as (priority, qname) pairs. response = dns_resolver_resolve_shim(domain, "MX") mtas = sorted([(r.preference, str(r.exchange).rstrip('.')) for r in response]) mx_fallback = None + + # Do not permit delivery if there is only a "null MX" record (whose value is + # (0, ".") but we've stripped trailing dots, so the 'exchange' is just ""). 
+ mtas = [(preference, exchange) for preference, exchange in mtas + if exchange != ""] + if len(mtas) == 0: + raise EmailUndeliverableError("The domain name %s does not accept email." % domain_i18n) + except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN, dns.resolver.NoAnswer): # If there was no MX record, fall back to an A record. diff --git a/tests/test_main.py b/tests/test_main.py index 12780bf..e5d23e6 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,4 +1,3 @@ -from unittest import mock import dns.resolver import pytest from email_validator import EmailSyntaxError, EmailUndeliverableError, \ @@ -292,10 +291,6 @@ def test_dict_accessor(): assert valid_email.as_dict()["original_email"] == input_email -def test_deliverability_no_records(): - assert validate_email_deliverability('example.com', 'example.com') == {'mx': [(0, '')], 'mx-fallback': None} - - def test_deliverability_found(): response = validate_email_deliverability('gmail.com', 'gmail.com') assert response.keys() == {'mx', 'mx-fallback'} @@ -307,10 +302,16 @@ def test_deliverability_found(): def test_deliverability_fails(): + # No MX record. domain = 'xkxufoekjvjfjeodlfmdfjcu.com' with pytest.raises(EmailUndeliverableError, match='The domain name {} does not exist'.format(domain)): validate_email_deliverability(domain, domain) + # Null MX record. 
+ domain = 'example.com' + with pytest.raises(EmailUndeliverableError, match='The domain name {} does not accept email'.format(domain)): + validate_email_deliverability(domain, domain) + def test_deliverability_dns_timeout(): validate_email_deliverability.TEST_CHECK_TIMEOUT = True @@ -379,25 +380,34 @@ def test_main_output_shim(monkeypatch, capsys): assert stdout == "b'An email address cannot have a period immediately after the @-sign.'\n" -@mock.patch("dns.resolver.LRUCache.put") -def test_validate_email__with_caching_resolver(mocked_put): - dns_resolver = caching_resolver() - validate_email("test@gmail.com", dns_resolver=dns_resolver) - assert mocked_put.called +def test_validate_email__with_caching_resolver(): + # unittest.mock.patch("dns.resolver.LRUCache.get") doesn't + # work --- it causes get to always return an empty list. + # So we'll mock our own way. + class MockedCache: + get_called = False + put_called = False - with mock.patch("dns.resolver.LRUCache.get") as mocked_get: - validate_email("test@gmail.com", dns_resolver=dns_resolver) - assert mocked_get.called + def get(self, key): + self.get_called = True + return None + def put(self, key, value): + self.put_called = True -@mock.patch("dns.resolver.LRUCache.put") -def test_validate_email__with_configured_resolver(mocked_put): + # Test with caching_resolver helper method. + mocked_cache = MockedCache() + dns_resolver = caching_resolver(cache=mocked_cache) + validate_email("test@gmail.com", dns_resolver=dns_resolver) + assert mocked_cache.put_called + validate_email("test@gmail.com", dns_resolver=dns_resolver) + assert mocked_cache.get_called + + # Test with dns.resolver.Resolver instance. 
dns_resolver = dns.resolver.Resolver() dns_resolver.lifetime = 10 - dns_resolver.cache = dns.resolver.LRUCache(max_size=1000) + dns_resolver.cache = MockedCache() validate_email("test@gmail.com", dns_resolver=dns_resolver) - assert mocked_put.called - - with mock.patch("dns.resolver.LRUCache.get") as mocked_get: - validate_email("test@gmail.com", dns_resolver=dns_resolver) - assert mocked_get.called + assert mocked_cache.put_called + validate_email("test@gmail.com", dns_resolver=dns_resolver) + assert mocked_cache.get_called From 9f2195982b9b97f5061517e91516e74189d69cc6 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Thu, 21 Apr 2022 09:34:12 -0400 Subject: [PATCH 023/174] Improve safety of exception text by not repeating an unsafe input character in the message and add a new test for various unsafe characters --- email_validator/__init__.py | 2 +- tests/test_main.py | 24 ++++++++++++++++++------ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/email_validator/__init__.py b/email_validator/__init__.py index 40798a6..8a77f8f 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -354,7 +354,7 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals if not m: # It's not a valid internationalized address either. Report which characters were not valid. bad_chars = ', '.join(sorted(set( - c for c in local if not re.match(u"[" + (ATEXT if not allow_smtputf8 else ATEXT_UTF8) + u"]", c) + unicodedata.name(c, repr(c)) for c in local if not re.match(u"[" + (ATEXT if not allow_smtputf8 else ATEXT_UTF8) + u"]", c) ))) raise EmailSyntaxError("The email address contains invalid characters before the @-sign: %s." 
% bad_chars) diff --git a/tests/test_main.py b/tests/test_main.py index e5d23e6..921d262 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -227,16 +227,16 @@ def test_email_valid(email_input, output): ('my@example\n.com', 'The domain name example\n.com contains invalid characters (Codepoint U+000A at position 8 of ' '\'example\\n\' not allowed).'), - ('.leadingdot@domain.com', 'The email address contains invalid characters before the @-sign: ..'), - ('..twodots@domain.com', 'The email address contains invalid characters before the @-sign: ..'), - ('twodots..here@domain.com', 'The email address contains invalid characters before the @-sign: ..'), + ('.leadingdot@domain.com', 'The email address contains invalid characters before the @-sign: FULL STOP.'), + ('..twodots@domain.com', 'The email address contains invalid characters before the @-sign: FULL STOP.'), + ('twodots..here@domain.com', 'The email address contains invalid characters before the @-sign: FULL STOP.'), ('me@⒈wouldbeinvalid.com', "The domain name ⒈wouldbeinvalid.com contains invalid characters (Codepoint U+2488 not allowed " "at position 1 in '⒈wouldbeinvalid.com')."), ('@example.com', 'There must be something before the @-sign.'), - ('\nmy@example.com', 'The email address contains invalid characters before the @-sign: \n.'), - ('m\ny@example.com', 'The email address contains invalid characters before the @-sign: \n.'), - ('my\n@example.com', 'The email address contains invalid characters before the @-sign: \n.'), + ('\nmy@example.com', 'The email address contains invalid characters before the @-sign: \'\\n\'.'), + ('m\ny@example.com', 'The email address contains invalid characters before the @-sign: \'\\n\'.'), + ('my\n@example.com', 'The email address contains invalid characters before the @-sign: \'\\n\'.'), ('11111111112222222222333333333344444444445555555555666666666677777@example.com', 'The email address is too long before the @-sign (1 character too many).'), 
('111111111122222222223333333333444444444455555555556666666666777777@example.com', 'The email address is too long before the @-sign (2 characters too many).'), ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444444444455555555556.com', 'The email address is too long after the @-sign.'), @@ -278,6 +278,18 @@ def test_email_invalid_reserved_domain(email_input): # print(f'({email_input!r}, {str(exc_info.value)!r}),') assert "is a special-use or reserved name" in str(exc_info.value) +@pytest.mark.parametrize( + 'email_input', + [ + ('white space@test'), + ('\n@test'), + ], +) +def test_email_unsafe_character(email_input): + # Check for various unsafe characters: + with pytest.raises(EmailSyntaxError) as exc_info: + validate_email(email_input, test_environment=True) + assert "invalid character" in str(exc_info.value) def test_email_test_domain_name_in_test_environment(): validate_email("anything@test", test_environment=True) From df852f7e380745b5d9612c274a9de4d8cd641c79 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Thu, 21 Apr 2022 09:46:43 -0400 Subject: [PATCH 024/174] Prevent an unhandled exception encoding to UTF-8 if the input has a surrogate code point --- email_validator/__init__.py | 7 +++++++ tests/test_main.py | 1 + 2 files changed, 8 insertions(+) diff --git a/email_validator/__init__.py b/email_validator/__init__.py index 8a77f8f..c2a8879 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -368,6 +368,13 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals # so we'll return the normalized local part in the return value. local = unicodedata.normalize("NFC", local) + # Try encoding to UTF-8. Failure is possible with some characters like + # surrogate code points. 
+ try: + local.encode("utf8") + except ValueError: + raise EmailSyntaxError("The email address contains an invalid character.") + # Flag that SMTPUTF8 will be required for deliverability. return { "local_part": local, diff --git a/tests/test_main.py b/tests/test_main.py index 921d262..3d25b5e 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -283,6 +283,7 @@ def test_email_invalid_reserved_domain(email_input): [ ('white space@test'), ('\n@test'), + ('\uD800@test'), # surrogate (Cs) ], ) def test_email_unsafe_character(email_input): From b255f9eb80b1fbe956eeba72e0b4e4fa48c7dfd8 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Thu, 21 Apr 2022 10:02:07 -0400 Subject: [PATCH 025/174] Block unsafe Unicode characters in the local part --- README.md | 28 ++++++++++++++++++++++------ email_validator/__init__.py | 29 ++++++++++++++++++++++++++++- tests/test_main.py | 10 +++++++++- 3 files changed, 59 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index f5712c8..9ea0b85 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Key features: to end users). * (optionally) Checks deliverability: Does the domain name resolve? And you can override the default DNS resolver. * Supports internationalized domain names and (optionally) - internationalized local parts. + internationalized local parts, but blocks unsafe characters. * Normalizes email addresses (super important for internationalized addresses! see below). @@ -172,12 +172,28 @@ The second sort of internationalization is internationalization in the *local* part of the address (before the @-sign). In non-internationalized email addresses, only English letters, numbers, and some punctuation (`._!#$%&'^``*+-=~/?{|}`) are allowed. In internationalized email address -local parts, all Unicode characters are allowed by this library, although -it's possible that not all characters will be allowed by all mail systems. 
- -To deliver email to addresses with Unicode, non-English characters, your mail +local parts, a wider range of Unicode characters are allowed. + +A surprisingly large number of Unicode characters are not safe to display, +especially when the email address is concatenated with other text, so this +library tries to protect you by not permitting reserved, noncharacter, private use, +formatting (which can be used to alter the display order of characters), +whitespace, and control characters, and combining characters +as the first character (so that they cannot combine with something outside +of the email address string). See https://qntm.org/safe and https://trojansource.codes/ +for relevant prior work. (Other than whitespace, these are checks that +you should be applying to nearly all user inputs in a security-sensitive +context.) + +These character checks are performed after Unicode normalization (see below), +so you are only fully protected if you replace all user-provided email addresses +with the normalized email address string returned by this library. This does not +guard against the well known problem that many Unicode characters look alike +(or are identical), which can be used to fool humans reading displayed text. + +Email addresses with these non-ASCII characters require that your mail submission library and the mail servers along the route to the destination, -including your own outbound mail server, must all support the +including your own outbound mail server, all support the [SMTPUTF8 (RFC 6531)](https://tools.ietf.org/html/rfc6531) extension. Support for SMTPUTF8 varies. See the `allow_smtputf8` parameter. diff --git a/email_validator/__init__.py b/email_validator/__init__.py index c2a8879..c0fd79d 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -368,8 +368,35 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals # so we'll return the normalized local part in the return value.
local = unicodedata.normalize("NFC", local) + # Check for unsafe characters. + # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked + # by DOT_ATOM_TEXT_UTF8. + for i, c in enumerate(local): + category = unicodedata.category(c) + if category[0] in ("L", "N", "P", "S"): + # letters, numbers, punctuation, and symbols are permitted + pass + elif category[0] == "M": + # combining character in first position would combine with something + # outside of the email address if concatenated to the right, but are + # otherwise permitted + if i == 0: + raise EmailSyntaxError("The email address contains an initial invalid character (%s)." + % unicodedata.name(c, repr(c))) + elif category[0] in ("Z", "C"): + # spaces and line/paragraph characters (Z) and + # control, format, surrogate, private use, and unassigned code points (C) + raise EmailSyntaxError("The email address contains an invalid character (%s)." + % unicodedata.name(c, repr(c))) + else: + # All categories should be handled above, but in case there is something new + # in the future. + raise EmailSyntaxError("The email address contains a character (%s; category %s) that may not be safe." + % (unicodedata.name(c, repr(c)), category)) + # Try encoding to UTF-8. Failure is possible with some characters like - # surrogate code points. + # surrogate code points, but those are checked above. Still, we don't + # want to have an unhandled exception later. 
try: local.encode("utf8") except ValueError: diff --git a/tests/test_main.py b/tests/test_main.py index 3d25b5e..b81c4b6 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -278,12 +278,19 @@ def test_email_invalid_reserved_domain(email_input): # print(f'({email_input!r}, {str(exc_info.value)!r}),') assert "is a special-use or reserved name" in str(exc_info.value) + @pytest.mark.parametrize( 'email_input', [ ('white space@test'), ('\n@test'), - ('\uD800@test'), # surrogate (Cs) + ('\u2005@test'), # four-per-em space (Zs) + ('\u009C@test'), # string terminator (Cc) + ('\u200B@test'), # zero-width space (Cf) + ('\u202Dforward-\u202Ereversed@test'), # BIDI (Cf) + ('\uD800@test'), # surrogate (Cs) + ('\uE000@test'), # private use (Co) + ('\uFDEF@test'), # unassigned (Cn) ], ) def test_email_unsafe_character(email_input): @@ -292,6 +299,7 @@ def test_email_unsafe_character(email_input): validate_email(email_input, test_environment=True) assert "invalid character" in str(exc_info.value) + def test_email_test_domain_name_in_test_environment(): validate_email("anything@test", test_environment=True) validate_email("anything@mycompany.test", test_environment=True) From 8f2936a4811ae0ebb5298ccb4a4de13b7b28b07d Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Thu, 21 Apr 2022 10:38:26 -0400 Subject: [PATCH 026/174] Drop Python 3.7 from Travis build which is erroring for an unknown reason --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 100a5ed..529f3bc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,7 +8,7 @@ python: #- '3.4' - '3.5' - '3.6' -- '3.7' +#- '3.7' - '3.8' - '3.9' From 4d4fa72b94a5e9bbec6dfab5681e0255b21e8f09 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 24 Apr 2022 13:09:29 -0400 Subject: [PATCH 027/174] Remove redundant import, fixes #68 --- email_validator/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/email_validator/__init__.py b/email_validator/__init__.py 
index c0fd79d..bbefcf7 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -621,7 +621,6 @@ def dns_resolver_resolve_shim(domain, record): def main(): - import sys import json def __utf8_input_shim(input_str): From 8f4cf0040113e7422ac8d1e509b264fc5a2404b5 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 24 Apr 2022 13:24:35 -0400 Subject: [PATCH 028/174] Use a caching resolver when checking email addresses in bulk on the command line --- email_validator/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/email_validator/__init__.py b/email_validator/__init__.py index bbefcf7..6049a3c 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -634,10 +634,12 @@ def __utf8_output_shim(output_str): return output_str if len(sys.argv) == 1: + # Validate the email addresses passed line-by-line on STDIN. + dns_resolver = caching_resolver() for line in sys.stdin: email = __utf8_input_shim(line.strip()) try: - validate_email(email) + validate_email(email, dns_resolver=dns_resolver) except EmailNotValidError as e: print(__utf8_output_shim("{} {}".format(email, e))) else: From 2bd672f7efd720868e2876a214295ea81411b7b4 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 24 Apr 2022 13:33:59 -0400 Subject: [PATCH 029/174] Release version 1.2.0 --- README.md | 5 +++-- setup.cfg | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 9ea0b85..7de2045 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ email-validator: Validate Email Addresses ========================================= A robust email address syntax and deliverability validation library for -Python 2.7/3.4+ by [Joshua Tauberer](https://joshdata.me). +Python 3.5+ by [Joshua Tauberer](https://joshdata.me). This library validates that a string is of the form `name@example.com`.
This is the sort of validation you would want for an email-based login form on @@ -228,6 +228,7 @@ change the user's login information without telling them.) ### UCS-4 support required for Python 2.7 +This library hopefully still works with Python 2.7. Note that when using Python 2.7, it is required that it was built with UCS-4 support (see [here](https://stackoverflow.com/questions/29109944/python-returns-length-of-2-for-single-unicode-character-string)); @@ -404,7 +405,7 @@ rm -rf dist python3 setup.py sdist python3 setup.py bdist_wheel twine upload dist/* -git tag v1.0.XXX # replace with version in setup.py +git tag v1.0.XXX # replace with version in setup.cfg git push --tags ``` diff --git a/setup.cfg b/setup.cfg index 43ce496..f1c11d3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,7 +1,7 @@ [metadata] name = email_validator -version = 1.1.3 -description = A robust email syntax and deliverability validation library for Python 2.x/3.x. +version = 1.2.0 +description = A robust email syntax and deliverability validation library. long_description = file: README.md long_description_content_type = text/markdown url = https://github.com/JoshData/python-email-validator From 9d7d02aa5cc04cb6ea645443992ce9916551ae6d Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 26 Apr 2022 18:04:22 -0400 Subject: [PATCH 030/174] Remove example and example.com/net/org from the special use domains list By popular demand in #78. 
--- email_validator/__init__.py | 64 ++++++++++++++++++++++++++++++------- tests/test_main.py | 21 ++++++++++-- 2 files changed, 71 insertions(+), 14 deletions(-) diff --git a/email_validator/__init__.py b/email_validator/__init__.py index 6049a3c..ffccac1 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -39,20 +39,66 @@ # IANA Special Use Domain Names # Last Updated 2021-09-21 # https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.txt +# # The domain names without dots would be caught by the check that the domain # name in an email address must have a period, but this list will also catch # subdomains of these domains, which are also reserved. SPECIAL_USE_DOMAIN_NAMES = ( - "arpa", # consolidated from a lot of arpa subdomains, we'll assume all subdomains of arpa are actually reserved - "example", - "example.com", - "example.net", - "example.org", + # The "arpa" entry here is consolidated from a lot of arpa subdomains + # for private address (i.e. non-routable IP addresses like 172.16.x.x) + # reverse mapping, plus some other subdomains. Although RFC 6761 says + # that application software should not treat these domains as special, + # they are private-use domains and so cannot have globally deliverable + # email addresses, which is an assumption of this library, and probably + # all of arpa is similarly special-use, so we reject it all. + "arpa", + + # RFC 6761 says applications "SHOULD NOT" treat the "example" domains + # as special, i.e. applications should accept these domains. + # + # The domain "example" alone fails our syntax validation because it + # lacks a dot (we assume no one has an email address on a TLD directly). 
+ # "@example.com/net/org" will currently fail DNS-based deliverability + # checks because IANA publishes a NULL MX for these domains, and + # "@mail.example[.com/net/org]" and other subdomains will fail DNS- + # based deliverability checks because IANA does not publish MX or A + # DNS records for these subdomains. + # "example", # i.e. "wwww.example" + # "example.com", + # "example.net", + # "example.org", + + # RFC 6761 says that applications are permitted to treat this domain + # as special and that DNS should return an immediate negative response, + # so we also immediately reject this domain, which also follows the + # purpose of the domain. "invalid", + + # RFC 6762 says that applications "may" treat ".local" as special and + # that "name resolution APIs and libraries SHOULD recognize these names + # as special," and since ".local" has no global definition, we reject + # it, as we expect email addresses to be gloally routable. "local", + + # RFC 6761 says that applications (like this library) are permitted + # to treat "localhost" as special, and since it cannot have a globally + # deliverable email address, we reject it. "localhost", + + # RFC 7686 says "applications that do not implement the Tor protocol + # SHOULD generate an error upon the use of .onion and SHOULD NOT + # perform a DNS lookup. "onion", - "test", # see special logic for 'test' where this is checked + + # Although RFC 6761 says that application software should not treat + # these domains as special, it also warns users that the address may + # resolve differently in different systems, and therefore it cannot + # have a globally routable email address, which is an assumption of + # this library, so we reject "@test" and "@*.test" addresses, unless + # the test_environment keyword argument is given, to allow their use + # in application-level test environments. These domains will generally + # fail deliverability checks because "test" is not an actual TLD. 
+ "test", ) # ease compatibility in type checking @@ -501,11 +547,7 @@ def validate_email_domain_part(domain, test_environment=False): # Some might fail DNS-based deliverability checks, but that # can be turned off, so we should fail them all sooner. for d in SPECIAL_USE_DOMAIN_NAMES: - # RFC 6761 says that applications should not block use of the 'test' - # domain name, presumably because that would prevent it from being - # used for actual testing. We'll block it, except when a special - # testing flag is used, indicating that the module is being used - # in a test environment. + # See the note near the definition of SPECIAL_USE_DOMAIN_NAMES. if d == "test" and test_environment: continue diff --git a/tests/test_main.py b/tests/test_main.py index b81c4b6..f1f731d 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,4 +1,5 @@ import dns.resolver +import re import pytest from email_validator import EmailSyntaxError, EmailUndeliverableError, \ validate_email, validate_email_deliverability, \ @@ -260,9 +261,6 @@ def test_email_invalid_syntax(email_input, error_msg): 'email_input', [ ('me@anything.arpa'), - ('me@anything.example'), - ('me@example.com'), - ('me@mail.example.com'), ('me@valid.invalid'), ('me@link.local'), ('me@host.localhost'), @@ -279,6 +277,23 @@ def test_email_invalid_reserved_domain(email_input): assert "is a special-use or reserved name" in str(exc_info.value) +@pytest.mark.parametrize( + 'email_input', + [ + ('me@mail.example'), + ('me@example.com'), + ('me@mail.example.com'), + ], +) +def test_email_example_reserved_domain(email_input): + # Since these all fail deliverabiltiy from a static list, + # DNS deliverability checks do not arise. 
+ with pytest.raises(EmailUndeliverableError) as exc_info: + validate_email(email_input) + # print(f'({email_input!r}, {str(exc_info.value)!r}),') + assert re.match(r"The domain name [a-z\.]+ does not (accept email|exist)\.", str(exc_info.value)) is not None + + @pytest.mark.parametrize( 'email_input', [ From 91aa2603c5672e2571df56d9b9d57878bf86ead4 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Wed, 27 Apr 2022 10:57:51 -0400 Subject: [PATCH 031/174] Document how to modify SPECIAL_USE_DOMAIN_NAMES --- README.md | 59 ++++++++++++++++++++++++------------- email_validator/__init__.py | 4 +-- 2 files changed, 41 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 7de2045..0beb0aa 100644 --- a/README.md +++ b/README.md @@ -43,11 +43,11 @@ pip install email-validator `pip3` also works. -Usage ------ +Quick Start +----------- If you're validating a user's email address before creating a user -account, you might do this: +account in your application, you might do this: ```python from email_validator import validate_email, EmailNotValidError @@ -66,28 +66,18 @@ except EmailNotValidError as e: ``` This validates the address and gives you its normalized form. You should -put the normalized form in your database and always normalize before +**put the normalized form in your database** and always normalize before checking if an address is in your database. 
-When validating many email addresses or to control the timeout (the default is 15 seconds), create a caching [dns.resolver.Resolver](https://dnspython.readthedocs.io/en/latest/resolver-class.html) to reuse in each call: - -```python -from email_validator import validate_email, caching_resolver - -resolver = caching_resolver(timeout=10) - -while True: - email = validate_email(email, dns_resolver=resolver).email -``` - The validator will accept internationalized email addresses, but not all mail systems can send email to an addresses with non-English characters in the *local* part of the address (before the @-sign). See the `allow_smtputf8` option below. +Usage +----- -Overview --------- +### Overview The module provides a function `validate_email(email_address)` which takes an email address (either a `str` or `bytes`, but only non-internationalized @@ -123,8 +113,10 @@ can bounce mail after a delay, and bounced mail may indicate a temporary failure of a good email address (sometimes an intentional failure, like greylisting). (A/AAAA-record fallback is also checked.) -The function also accepts the following keyword arguments (default as -shown): +### Options + +The `validate_email` function also accepts the following keyword arguments +(defaults are as shown below): `allow_smtputf8=True`: Set to `False` to prohibit internationalized addresses that would require the @@ -137,7 +129,34 @@ shown): `dns_resolver=None`: Pass an instance of [dns.resolver.Resolver](https://dnspython.readthedocs.io/en/latest/resolver-class.html) to control the DNS resolver including setting a timeout and [a cache](https://dnspython.readthedocs.io/en/latest/resolver-caching.html). The `caching_resolver` function shown above is a helper function to construct a dns.resolver.Resolver with a [LRUCache](https://dnspython.readthedocs.io/en/latest/resolver-caching.html#dns.resolver.LRUCache). Reuse the same resolver instance across calls to `validate_email` to make use of the cache. 
-In non-production test environments, you may want to allow `@test` or `@mycompany.test` email addresses to be used as placeholder email addresses, which would normally not be permitted. In that case, pass `test_environment=True`. DNS-based deliverability checks will be disabled as well. Other [Special Use Domain Names](https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml) are always considered invalid and raise `EmailUndeliverableError`. +`test_environment=False`: DNS-based deliverability checks are disabled and `test` and `subdomain.test` domain names are permitted (see below). + +### DNS timeout and cache + +When validating many email addresses or to control the timeout (the default is 15 seconds), create a caching [dns.resolver.Resolver](https://dnspython.readthedocs.io/en/latest/resolver-class.html) to reuse in each call. The `caching_resolver` function returns one easily for you: + +```python +from email_validator import validate_email, caching_resolver + +resolver = caching_resolver(timeout=10) + +while True: + email = validate_email(email, dns_resolver=resolver).email +``` + +### Test addresses + +This library rejects email addresess that use the [Special Use Domain Names](https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml) `invalid`, `localhost`, `test`, and some others by raising `EmailUndeliverableError`. This is to protect your system from abuse: You probably don't want a user to be able to cause an email to be sent to `localhost`. However, in your non-production test environments you may want to use `@test` or `@myname.test` email addresses. There are two ways you can allow this: + +A. Add `test_environment=True` to the call to `validate_email` (see above). +B. 
Remove the special-use domain name that you want to use from `email_validator.SPECIAL_USE_DOMAIN_NAMES`: + +```python +import email_validator +email_validator.SPECIAL_USE_DOMAIN_NAMES.remove("test") +``` + +It is tempting to use `@example.com/net/org` in tests. These domains are reserved to IANA for use in documentation so there is no risk of accidentally emailing someone at those domains. But beware that this library will reject these domain names if DNS-based deliverability checks are not disabled because these domains do not resolve to domains that accept email. In tests, consider using your own domain name or `@test` or `@myname.test` instead. Internationalized email addresses --------------------------------- diff --git a/email_validator/__init__.py b/email_validator/__init__.py index ffccac1..3293bac 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -43,7 +43,7 @@ # The domain names without dots would be caught by the check that the domain # name in an email address must have a period, but this list will also catch # subdomains of these domains, which are also reserved. -SPECIAL_USE_DOMAIN_NAMES = ( +SPECIAL_USE_DOMAIN_NAMES = [ # The "arpa" entry here is consolidated from a lot of arpa subdomains # for private address (i.e. non-routable IP addresses like 172.16.x.x) # reverse mapping, plus some other subdomains. Although RFC 6761 says @@ -99,7 +99,7 @@ # in application-level test environments. These domains will generally # fail deliverability checks because "test" is not an actual TLD. 
"test", -) +] # ease compatibility in type checking if sys.version_info >= (3,): From d64b2915cd48408686d32bc30eba327f2f9086f9 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Wed, 27 Apr 2022 11:06:26 -0400 Subject: [PATCH 032/174] Create module attributes to set global default values for keyword argument options --- README.md | 13 +++++++------ email_validator/__init__.py | 14 +++++++++----- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 0beb0aa..7640db5 100644 --- a/README.md +++ b/README.md @@ -120,16 +120,16 @@ The `validate_email` function also accepts the following keyword arguments `allow_smtputf8=True`: Set to `False` to prohibit internationalized addresses that would require the - [SMTPUTF8](https://tools.ietf.org/html/rfc6531) extension. + [SMTPUTF8](https://tools.ietf.org/html/rfc6531) extension. You can also set `email_validator.ALLOW_SMTPUTF8` to `False` to turn it off for all calls by default. -`check_deliverability=True`: Set to `False` to skip the domain name MX DNS record check. +`check_deliverability=True`: Set to `False` to skip the domain name MX DNS record check. You can also set `email_validator.CHECK_DELIVERABILITY` to `False` to turn it off for all calls by default. `allow_empty_local=False`: Set to `True` to allow an empty local part (i.e. `@example.com`), e.g. for validating Postfix aliases. `dns_resolver=None`: Pass an instance of [dns.resolver.Resolver](https://dnspython.readthedocs.io/en/latest/resolver-class.html) to control the DNS resolver including setting a timeout and [a cache](https://dnspython.readthedocs.io/en/latest/resolver-caching.html). The `caching_resolver` function shown above is a helper function to construct a dns.resolver.Resolver with a [LRUCache](https://dnspython.readthedocs.io/en/latest/resolver-caching.html#dns.resolver.LRUCache). Reuse the same resolver instance across calls to `validate_email` to make use of the cache. 
-`test_environment=False`: DNS-based deliverability checks are disabled and `test` and `subdomain.test` domain names are permitted (see below). +`test_environment=False`: DNS-based deliverability checks are disabled and `test` and `subdomain.test` domain names are permitted (see below). You can also set `email_validator.TEST_ENVIRONMENT` to `True` to turn it on for all calls by default. ### DNS timeout and cache @@ -146,10 +146,11 @@ while True: ### Test addresses -This library rejects email addresess that use the [Special Use Domain Names](https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml) `invalid`, `localhost`, `test`, and some others by raising `EmailUndeliverableError`. This is to protect your system from abuse: You probably don't want a user to be able to cause an email to be sent to `localhost`. However, in your non-production test environments you may want to use `@test` or `@myname.test` email addresses. There are two ways you can allow this: +This library rejects email addresess that use the [Special Use Domain Names](https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml) `invalid`, `localhost`, `test`, and some others by raising `EmailUndeliverableError`. This is to protect your system from abuse: You probably don't want a user to be able to cause an email to be sent to `localhost`. However, in your non-production test environments you may want to use `@test` or `@myname.test` email addresses. There are three ways you can allow this: -A. Add `test_environment=True` to the call to `validate_email` (see above). -B. Remove the special-use domain name that you want to use from `email_validator.SPECIAL_USE_DOMAIN_NAMES`: +1. Add `test_environment=True` to the call to `validate_email` (see above). +2. Set `email_validator.TEST_ENVIRONMENT` to `True`. +3. 
Remove the special-use domain name that you want to use from `email_validator.SPECIAL_USE_DOMAIN_NAMES`: ```python import email_validator diff --git a/email_validator/__init__.py b/email_validator/__init__.py index 3293bac..c82fca2 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -7,6 +7,12 @@ import dns.exception import idna # implements IDNA 2008; Python's codec is only IDNA 2003 +# Default values for keyword arguments. + +ALLOW_SMTPUTF8 = True +CHECK_DELIVERABILITY = True +TEST_ENVIRONMENT = False +DEFAULT_TIMEOUT = 15 # secs # Based on RFC 2822 section 3.2.4 / RFC 5322 section 3.2.3, these # characters are permitted in email addresses (not taking into @@ -112,8 +118,6 @@ DOT_ATOM_TEXT = DOT_ATOM_TEXT.decode("ascii") ATEXT_HOSTNAME = ATEXT_HOSTNAME.decode("ascii") -DEFAULT_TIMEOUT = 15 # secs - class EmailNotValidError(ValueError): """Parent class of all exceptions raised by this module.""" @@ -256,10 +260,10 @@ def caching_resolver(timeout=DEFAULT_TIMEOUT, cache=None): def validate_email( email, - allow_smtputf8=True, + allow_smtputf8=ALLOW_SMTPUTF8, allow_empty_local=False, - check_deliverability=True, - test_environment=False, + check_deliverability=CHECK_DELIVERABILITY, + test_environment=TEST_ENVIRONMENT, timeout=DEFAULT_TIMEOUT, dns_resolver=None ): From 810243112af0d38c3d317de80da685ca369736c0 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Wed, 27 Apr 2022 11:19:50 -0400 Subject: [PATCH 033/174] Rename some constants that were poorly named --- email_validator/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/email_validator/__init__.py b/email_validator/__init__.py index c82fca2..3d295ec 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -26,8 +26,8 @@ # addresses to also include three specific ranges of UTF8 defined in # RFC3629 section 4, which appear to be the Unicode code points from # U+0080 to U+10FFFF. 
-ATEXT_UTF8 = ATEXT + u"\u0080-\U0010FFFF" -DOT_ATOM_TEXT_UTF8 = '[' + ATEXT_UTF8 + ']+(?:\\.[' + ATEXT_UTF8 + ']+)*' +ATEXT_INTL = ATEXT + u"\u0080-\U0010FFFF" +DOT_ATOM_TEXT_INTL = '[' + ATEXT_INTL + ']+(?:\\.[' + ATEXT_INTL + ']+)*' # The domain part of the email address, after IDNA (ASCII) encoding, # must also satisfy the requirements of RFC 952/RFC 1123 which restrict @@ -400,11 +400,11 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals else: # The local part failed the ASCII check. Now try the extended internationalized requirements. - m = re.match(DOT_ATOM_TEXT_UTF8 + "\\Z", local) + m = re.match(DOT_ATOM_TEXT_INTL + "\\Z", local) if not m: # It's not a valid internationalized address either. Report which characters were not valid. bad_chars = ', '.join(sorted(set( - unicodedata.name(c, repr(c)) for c in local if not re.match(u"[" + (ATEXT if not allow_smtputf8 else ATEXT_UTF8) + u"]", c) + unicodedata.name(c, repr(c)) for c in local if not re.match(u"[" + (ATEXT if not allow_smtputf8 else ATEXT_INTL) + u"]", c) ))) raise EmailSyntaxError("The email address contains invalid characters before the @-sign: %s." % bad_chars) @@ -420,7 +420,7 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals # Check for unsafe characters. # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked - # by DOT_ATOM_TEXT_UTF8. + # by DOT_ATOM_TEXT_INTL. 
for i, c in enumerate(local): category = unicodedata.category(c) if category[0] in ("L", "N", "P", "S"): From dab623f851eb1ff81518285b5454ad90affb3798 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 1 May 2022 17:25:13 -0400 Subject: [PATCH 034/174] Update pinned packages in test_requirements.txt The latest packages don't all work in Python 3.5/3.6, so the latest versions that work with Python 3.6 are pinned (by creating a virtualenv with Python 3.6 and then using pip freeze), Python 3.5 is dropped from Travis, and the README is updated to indicate testing starts with version 3.6. --- .travis.yml | 4 +--- README.md | 6 +++++- setup.cfg | 2 -- test_requirements.txt | 32 ++++++++++++++++++++++++++------ 4 files changed, 32 insertions(+), 12 deletions(-) diff --git a/.travis.yml b/.travis.yml index 529f3bc..0ce2828 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,10 +5,8 @@ cache: pip python: #- '2.7' -#- '3.4' -- '3.5' - '3.6' -#- '3.7' +- '3.7' - '3.8' - '3.9' diff --git a/README.md b/README.md index 7640db5..a1ebea9 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ email-validator: Validate Email Addresses ========================================= A robust email address syntax and deliverability validation library for -Python 3.5+ by [Joshua Tauberer](https://joshdata.me). +Python by [Joshua Tauberer](https://joshdata.me). This library validates that a string is of the form `name@example.com`. This is the sort of validation you would want for an email-based login form on @@ -32,6 +32,10 @@ This library was first published in 2015. 
The current version is 1.1.1 from `validate_email` has changed, but dict-style access to the validated address information still works, so it is backwards compatible.** +This library is tested with Python 3.6+ but should work in earlier versions: + +[![Build Status](https://app.travis-ci.com/JoshData/python-email-validator.svg?branch=main)](https://app.travis-ci.com/JoshData/python-email-validator) + Installation ------------ diff --git a/setup.cfg b/setup.cfg index f1c11d3..6526625 100644 --- a/setup.cfg +++ b/setup.cfg @@ -16,8 +16,6 @@ classifiers = Programming Language :: Python :: 2 Programming Language :: Python :: 2.7 Programming Language :: Python :: 3 - Programming Language :: Python :: 3.5 - Programming Language :: Python :: 3.6 Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 diff --git a/test_requirements.txt b/test_requirements.txt index 0658825..38dab84 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -1,6 +1,26 @@ -coverage==4.5.4 -docutils==0.15.2 -flake8==3.7.9 -pytest==5.2.2 -pytest-cov==2.8.1 -idna>=2.8 +# This file was generated by running +# pip install dnspython idna # from setup.cfg +# pip install pytest pytest-cov coverage flake8 +# pip freeze +# in a virtualenv with Python 3.6. (Some packages' latest versions +# are not compatible with Python 3.6, so we must pin versions for +# repeatable testing in earlier versions of Python.) 
+attrs==21.4.0 +coverage==6.2 +dnspython==2.2.1 +flake8==4.0.1 +idna==3.3 +importlib-metadata==4.2.0 +iniconfig==1.1.1 +mccabe==0.6.1 +packaging==21.3 +pluggy==1.0.0 +py==1.11.0 +pycodestyle==2.8.0 +pyflakes==2.4.0 +pyparsing==3.0.7 +pytest==7.0.1 +pytest-cov==3.0.0 +tomli==1.2.3 +typing_extensions==4.1.1 +zipp==3.6.0 From e92e395d02b4e630613c33f56b91c5ecc6085c6c Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 1 May 2022 17:54:05 -0400 Subject: [PATCH 035/174] Add a note in the README about rejecting (comment) syntax --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a1ebea9..5f5f4c3 100644 --- a/README.md +++ b/README.md @@ -398,7 +398,8 @@ or likely to cause trouble: * The "quoted string" form of the local part of the email address (RFC 5321 4.1.2) is not permitted --- no one uses this anymore anyway. Quoted forms allow multiple @-signs, space characters, and other - troublesome conditions. + troublesome conditions. The unsual [(comment) syntax](https://github.com/JoshData/python-email-validator/issues/77) + in email addresses is also rejected. * The "literal" form for the domain part of an email address (an IP address) is not accepted --- no one uses this anymore anyway. From 95deaf8c7c7bc8315fffe980c37c6a14521f6f2f Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 1 May 2022 17:43:57 -0400 Subject: [PATCH 036/174] Version 1.2.1 --- README.md | 20 +++++++++++++++----- setup.cfg | 2 +- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 5f5f4c3..67885a5 100644 --- a/README.md +++ b/README.md @@ -27,15 +27,25 @@ And this library does NOT permit obsolete forms of email addresses, so if you need strict validation against the email specs exactly, use [pyIsEmail](https://github.com/michaelherold/pyIsEmail). -This library was first published in 2015. The current version is 1.1.1 -(posted May 19, 2020). 
**Starting in version 1.1.0, the type of the value returned -from `validate_email` has changed, but dict-style access to the validated -address information still works, so it is backwards compatible.** - This library is tested with Python 3.6+ but should work in earlier versions: [![Build Status](https://app.travis-ci.com/JoshData/python-email-validator.svg?branch=main)](https://app.travis-ci.com/JoshData/python-email-validator) +--- + +This library was first published in 2015. The current version is 1.2.1 +(posted May 1, 2022). The main changes in version 1.2 are: + +* Rejecting domains with NULL MX records (when deliverability checks + are turned on). +* Rejecting unsafe unicode characters. (Some of these checks you should + be doing on all of your user inputs already!) +* Rejecting most special-use reserved domain names. A new `test_environment` + option is added for using `@*.test` domains. +* Some fixes in the tests. + +--- + Installation ------------ diff --git a/setup.cfg b/setup.cfg index 6526625..d32921b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = email_validator -version = 1.2.0 +version = 1.2.1 description = A robust email syntax and deliverability validation library. 
long_description = file: README.md long_description_content_type = text/markdown From 254cc3a3b71020b7153484d5eafc796773da5051 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 1 May 2022 18:24:41 -0400 Subject: [PATCH 037/174] Update build instructions to remember how to authenticate with a pypi API token --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 67885a5..cde286f 100644 --- a/README.md +++ b/README.md @@ -439,7 +439,7 @@ pip3 install twine rm -rf dist python3 setup.py sdist python3 setup.py bdist_wheel -twine upload dist/* +twine upload dist/* # username: __token__ password: pypi API token git tag v1.0.XXX # replace with version in setup.cfg git push --tags ``` From c250c05125a07cf332b1f2a4de70215b8af9cd64 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 1 May 2022 18:33:55 -0400 Subject: [PATCH 038/174] Missed a changelog entry --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index cde286f..ce48599 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,8 @@ This library was first published in 2015. The current version is 1.2.1 be doing on all of your user inputs already!) * Rejecting most special-use reserved domain names. A new `test_environment` option is added for using `@*.test` domains. +* New module-level attributes are added to override the default values + of the keyword arguments and the special-use domains list. * Some fixes in the tests. --- From b87f8d3ea68e46b78247e7dee3b85822cce464bc Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Thu, 12 May 2022 17:22:45 -0400 Subject: [PATCH 039/174] Revert documentation for "Create module attributes to set global default values for keyword argument options" It wasn't working. This partially reverts commit d64b2915cd48408686d32bc30eba327f2f9086f9. 
--- README.md | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index ce48599..b097197 100644 --- a/README.md +++ b/README.md @@ -136,16 +136,16 @@ The `validate_email` function also accepts the following keyword arguments `allow_smtputf8=True`: Set to `False` to prohibit internationalized addresses that would require the - [SMTPUTF8](https://tools.ietf.org/html/rfc6531) extension. You can also set `email_validator.ALLOW_SMTPUTF8` to `False` to turn it off for all calls by default. + [SMTPUTF8](https://tools.ietf.org/html/rfc6531) extension. -`check_deliverability=True`: Set to `False` to skip the domain name MX DNS record check. You can also set `email_validator.CHECK_DELIVERABILITY` to `False` to turn it off for all calls by default. +`check_deliverability=True`: Set to `False` to skip the domain name MX DNS record check. `allow_empty_local=False`: Set to `True` to allow an empty local part (i.e. `@example.com`), e.g. for validating Postfix aliases. `dns_resolver=None`: Pass an instance of [dns.resolver.Resolver](https://dnspython.readthedocs.io/en/latest/resolver-class.html) to control the DNS resolver including setting a timeout and [a cache](https://dnspython.readthedocs.io/en/latest/resolver-caching.html). The `caching_resolver` function shown above is a helper function to construct a dns.resolver.Resolver with a [LRUCache](https://dnspython.readthedocs.io/en/latest/resolver-caching.html#dns.resolver.LRUCache). Reuse the same resolver instance across calls to `validate_email` to make use of the cache. -`test_environment=False`: DNS-based deliverability checks are disabled and `test` and `subdomain.test` domain names are permitted (see below). You can also set `email_validator.TEST_ENVIRONMENT` to `True` to turn it on for all calls by default. +`test_environment=False`: DNS-based deliverability checks are disabled and `test` and `subdomain.test` domain names are permitted (see below). 
### DNS timeout and cache @@ -162,11 +162,10 @@ while True: ### Test addresses -This library rejects email addresess that use the [Special Use Domain Names](https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml) `invalid`, `localhost`, `test`, and some others by raising `EmailUndeliverableError`. This is to protect your system from abuse: You probably don't want a user to be able to cause an email to be sent to `localhost`. However, in your non-production test environments you may want to use `@test` or `@myname.test` email addresses. There are three ways you can allow this: +This library rejects email addresess that use the [Special Use Domain Names](https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml) `invalid`, `localhost`, `test`, and some others by raising `EmailUndeliverableError`. This is to protect your system from abuse: You probably don't want a user to be able to cause an email to be sent to `localhost`. However, in your non-production test environments you may want to use `@test` or `@myname.test` email addresses. There are two ways you can allow this: -1. Add `test_environment=True` to the call to `validate_email` (see above). -2. Set `email_validator.TEST_ENVIRONMENT` to `True`. -3. Remove the special-use domain name that you want to use from `email_validator.SPECIAL_USE_DOMAIN_NAMES`: +A. Add `test_environment=True` to the call to `validate_email` (see above). +B. 
Remove the special-use domain name that you want to use from `email_validator.SPECIAL_USE_DOMAIN_NAMES`: ```python import email_validator From 65b27443603c4c577357ca29620c06474baf2e9e Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 1 Aug 2022 14:17:06 -0400 Subject: [PATCH 040/174] Recommend that check_deliverability be set to False for validation on login pages --- README.md | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index b097197..6a613bc 100644 --- a/README.md +++ b/README.md @@ -69,26 +69,27 @@ account in your application, you might do this: from email_validator import validate_email, EmailNotValidError email = "my+address@mydomain.tld" +is_new_account = True # False for login pages try: - # Validate & take the normalized form of the email - # address for all logic beyond this point (especially + # Check that the email address is valid. + validation = validate_email(email, check_deliverability=is_new_account) + + # Take the normalized form of the email address + # for all logic beyond this point (especially # before going to a database query where equality - # does not take into account normalization). - email = validate_email(email).email + # may not take into account Unicode normalization). + email = validation.email except EmailNotValidError as e: - # email is not valid, exception message is human-readable + # Email is not valid. + # The exception message is human-readable. print(str(e)) ``` This validates the address and gives you its normalized form. You should **put the normalized form in your database** and always normalize before -checking if an address is in your database. - -The validator will accept internationalized email addresses, but not all -mail systems can send email to an addresses with non-English characters in -the *local* part of the address (before the @-sign). See the `allow_smtputf8` -option below. +checking if an address is in your database. 
When using this in a login form, +set `check_deliverability` to `False` to avoid unnecessary DNS queries. Usage ----- @@ -138,7 +139,7 @@ The `validate_email` function also accepts the following keyword arguments require the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) extension. -`check_deliverability=True`: Set to `False` to skip the domain name MX DNS record check. +`check_deliverability=True`: Set to `False` to skip the domain name MX DNS record check. It is recommended to pass `False` when performing validation for login pages since re-validation of the domain by querying DNS at every login is probably undesirable. `allow_empty_local=False`: Set to `True` to allow an empty local part (i.e. `@example.com`), e.g. for validating Postfix aliases. From dad7b6c24be828097e38a73a7337b3eee489c6c2 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 1 May 2022 17:44:19 -0400 Subject: [PATCH 041/174] Check for 'v=spf1 -all' SPF records as a way to reject more bad domains --- README.md | 26 +++++++++++-------- email_validator/__init__.py | 52 +++++++++++++++++++++++++------------ tests/test_main.py | 4 +-- 3 files changed, 53 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 6a613bc..7f3bbe9 100644 --- a/README.md +++ b/README.md @@ -107,7 +107,7 @@ addresses are allowed when passing a `bytes`) and: When an email address is not valid, `validate_email` raises either an `EmailSyntaxError` if the form of the address is invalid or an -`EmailUndeliverableError` if the domain name fails the DNS check. Both +`EmailUndeliverableError` if the domain name fails DNS checks. Both exception classes are subclasses of `EmailNotValidError`, which in turn is a subclass of `ValueError`. @@ -121,14 +121,17 @@ they will probably give you grief if you're using email for login. (See later in the document about that.) The validator checks that the domain name in the email address has a -(non-null) MX DNS record indicating that it is configured for email. 
+DNS MX record (except a NULL MX record) indicating that it can receive +email and that it does not have a reject-all SPF record (`v=spf1 -all`) +which would indicate that it cannot send email. +(A/AAAA-record MX fallback is also checked.) There is nothing to be gained by trying to actually contact an SMTP server, so that's not done here. For privacy, security, and practicality reasons servers are good at not giving away whether an address is deliverable or not: email addresses that appear to accept mail at first can bounce mail after a delay, and bounced mail may indicate a temporary failure of a good email address (sometimes an intentional failure, like -greylisting). (A/AAAA-record fallback is also checked.) +greylisting). ### Options @@ -139,7 +142,7 @@ The `validate_email` function also accepts the following keyword arguments require the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) extension. -`check_deliverability=True`: Set to `False` to skip the domain name MX DNS record check. It is recommended to pass `False` when performing validation for login pages since re-validation of the domain by querying DNS at every login is probably undesirable. +`check_deliverability=True`: Set to `False` to skip DNS record checks for the domain. It is recommended to pass `False` when performing validation for login pages since re-validation of the domain by querying DNS at every login is probably undesirable. `allow_empty_local=False`: Set to `True` to allow an empty local part (i.e. `@example.com`), e.g. for validating Postfix aliases. 
@@ -324,9 +327,7 @@ ValidatedEmail( ascii_email='test@joshdata.me', ascii_local_part='test', ascii_domain='joshdata.me', - smtputf8=False, - mx=[(10, 'box.occams.info')], - mx_fallback_type=None) + smtputf8=False) ``` For the fictitious address `example@ツ.life`, which has an @@ -393,6 +394,7 @@ are: | `smtputf8` | A boolean indicating that the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) feature of your mail relay will be required to transmit messages to this address because the local part of the address has non-ASCII characters (the local part cannot be IDNA-encoded). If `allow_smtputf8=False` is passed as an argument, this flag will always be false because an exception is raised if it would have been true. | | `mx` | A list of (priority, domain) tuples of MX records specified in the DNS for the domain (see [RFC 5321 section 5](https://tools.ietf.org/html/rfc5321#section-5)). May be `None` if the deliverability check could not be completed because of a temporary issue like a timeout. | | `mx_fallback_type` | `None` if an `MX` record is found. If no MX records are actually specified in DNS and instead are inferred, through an obsolete mechanism, from A or AAAA records, the value is the type of DNS record used instead (`A` or `AAAA`). May be `None` if the deliverability check could not be completed because of a temporary issue like a timeout. | +| `spf` | Any SPF record found while checking deliverability. | Assumptions ----------- @@ -402,10 +404,12 @@ strictly conform to the standards. Many email address forms are obsolete or likely to cause trouble: * The validator assumes the email address is intended to be - deliverable on the public Internet. The domain part - of the email address must be a resolvable domain name. - [Special Use Domain Names](https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml) - and their subdomains are always considered invalid (except see + usable on the public Internet. 
The domain part + of the email address must be a resolvable domain name + (without NULL MX or SPF -all DNS records) if deliverability + checks are turned on. + Most [Special Use Domain Names](https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml) + and their subdomains are considered invalid (except see the `test_environment` parameter above). * The "quoted string" form of the local part of the email address (RFC 5321 4.1.2) is not permitted --- no one uses this anymore anyway. diff --git a/email_validator/__init__.py b/email_validator/__init__.py index 3d295ec..853c942 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -356,9 +356,8 @@ def validate_email( deliverability_info = validate_email_deliverability( ret["domain"], ret["domain_i18n"], timeout, dns_resolver ) - if "mx" in deliverability_info: - ret.mx = deliverability_info["mx"] - ret.mx_fallback_type = deliverability_info["mx-fallback"] + for key, value in deliverability_info.items(): + setattr(ret, key, value) return ret @@ -588,6 +587,8 @@ def validate_email_deliverability(domain, domain_i18n, timeout=DEFAULT_TIMEOUT, dns_resolver = dns.resolver.get_default_resolver() dns_resolver.lifetime = timeout + deliverability_info = {} + def dns_resolver_resolve_shim(domain, record): try: # dns.resolver.Resolver.resolve is new to dnspython 2.x. @@ -611,39 +612,61 @@ def dns_resolver_resolve_shim(domain, record): raise dns.exception.Timeout() try: - # Try resolving for MX records and get them in sorted priority order - # as (priority, qname) pairs. + # Try resolving for MX records. response = dns_resolver_resolve_shim(domain, "MX") + + # For reporting, put them in priority order and remove the trailing dot in the qnames. 
mtas = sorted([(r.preference, str(r.exchange).rstrip('.')) for r in response]) - mx_fallback = None - # Do not permit delivery if there is only a "null MX" record (whose value is - # (0, ".") but we've stripped trailing dots, so the 'exchange' is just ""). + # Remove "null MX" records from the list (their value is (0, ".") but we've stripped + # trailing dots, so the 'exchange' is just ""). If there was only a null MX record, + # email is not deliverable. mtas = [(preference, exchange) for preference, exchange in mtas if exchange != ""] if len(mtas) == 0: raise EmailUndeliverableError("The domain name %s does not accept email." % domain_i18n) + deliverability_info["mx"] = mtas + deliverability_info["mx_fallback_type"] = None + except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN, dns.resolver.NoAnswer): # If there was no MX record, fall back to an A record. try: response = dns_resolver_resolve_shim(domain, "A") - mtas = [(0, str(r)) for r in response] - mx_fallback = "A" + deliverability_info["mx"] = [(0, str(r)) for r in response] + deliverability_info["mx_fallback_type"] = "A" except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN, dns.resolver.NoAnswer): # If there was no A record, fall back to an AAAA record. try: response = dns_resolver_resolve_shim(domain, "AAAA") - mtas = [(0, str(r)) for r in response] - mx_fallback = "AAAA" + deliverability_info["mx"] = [(0, str(r)) for r in response] + deliverability_info["mx_fallback_type"] = "AAAA" except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN, dns.resolver.NoAnswer): # If there was no MX, A, or AAAA record, then mail to # this domain is not deliverable. raise EmailUndeliverableError("The domain name %s does not exist." % domain_i18n) + try: + # Check for a SPF reject all ("v=spf1 -all") record which indicates + # no emails are sent from this domain, which like a NULL MX record + # would indicate that the domain is not used for email. 
+ response = dns_resolver_resolve_shim(domain, "TXT") + for rec in response: + value = b"".join(rec.strings) + if value.startswith(b"v=spf1 "): + deliverability_info["spf"] = value.decode("ascii", errors='replace') + if value == b"v=spf1 -all": + raise EmailUndeliverableError("The domain name %s does not send email." % domain_i18n) + except dns.resolver.NoAnswer: + # No TXT records means there is no SPF policy, so we cannot take any action. + pass + except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN): + # Failure to resolve at this step will be ignored. + pass + except dns.exception.Timeout: # A timeout could occur for various reasons, so don't treat it as a failure. return { @@ -660,10 +683,7 @@ def dns_resolver_resolve_shim(domain, record): "There was an error while checking if the domain name in the email address is deliverable: " + str(e) ) - return { - "mx": mtas, - "mx-fallback": mx_fallback, - } + return deliverability_info def main(): diff --git a/tests/test_main.py b/tests/test_main.py index f1f731d..2579e2f 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -329,8 +329,8 @@ def test_dict_accessor(): def test_deliverability_found(): response = validate_email_deliverability('gmail.com', 'gmail.com') - assert response.keys() == {'mx', 'mx-fallback'} - assert response['mx-fallback'] is None + assert response.keys() == {'mx', 'mx_fallback_type', 'spf'} + assert response['mx_fallback_type'] is None assert len(response['mx']) > 1 assert len(response['mx'][0]) == 2 assert isinstance(response['mx'][0][0], int) From 8afa90dd70a70b70195a2d1f2d12258357a4232b Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 2 May 2022 17:54:34 -0400 Subject: [PATCH 042/174] Add the pyIsEmail test cases --- README.md | 16 ++-- tests/test_main.py | 199 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 209 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 7f3bbe9..81df18d 100644 --- a/README.md +++ b/README.md @@ -410,14 +410,18 
@@ or likely to cause trouble: checks are turned on. Most [Special Use Domain Names](https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml) and their subdomains are considered invalid (except see - the `test_environment` parameter above). -* The "quoted string" form of the local part of the email address (RFC - 5321 4.1.2) is not permitted --- no one uses this anymore anyway. + the `test_environment` parameter above), if deliverability checks are + turned on. Domain names without a `.` are rejected as a syntax error + since no one has an email address directly at a TLD, and a missing + TLD is a common user error. +* Obsolete email syntaxes are rejected: + The "quoted string" form of the local part of the email address (RFC + 5321 4.1.2) is not permitted. Quoted forms allow multiple @-signs, space characters, and other troublesome conditions. The unsual [(comment) syntax](https://github.com/JoshData/python-email-validator/issues/77) - in email addresses is also rejected. -* The "literal" form for the domain part of an email address (an - IP address) is not accepted --- no one uses this anymore anyway. + is also rejected. The "literal" form for the domain part of an email address (an + IP address in brackets) is rejected. Other obsolete and deprecated syntaxes are + rejected. No one uses these forms anymore. Testing ------- diff --git a/tests/test_main.py b/tests/test_main.py index 2579e2f..bf2a12f 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -320,6 +320,205 @@ def test_email_test_domain_name_in_test_environment(): validate_email("anything@mycompany.test", test_environment=True) +# This is the pyIsEmail (https://github.com/michaelherold/pyIsEmail) test suite. 
+# +# The test data was extracted by: +# +# $ wget https://raw.githubusercontent.com/michaelherold/pyIsEmail/master/tests/data/tests.xml +# $ xmllint --xpath '/tests/test/address/text()' tests.xml > t1 +# $ xmllint --xpath "/tests/test[not(address='')]/diagnosis/text()" tests.xml > t2 +# +# tests = [] +# def fixup_char(c): +# if ord(c) >= 0x2400 and ord(c) <= 0x2432: +# c = chr(ord(c)-0x2400) +# return c +# for email, diagnosis in zip(open("t1"), open("t2")): +# email = email[:-1] # strip trailing \n but not more because trailing whitespace is significant +# email = "".join(fixup_char(c) for c in email).replace("&amp;", "&") +# tests.append([email, diagnosis.strip()]) +# print(repr(tests).replace("'], ['", "'],\n['")) +@pytest.mark.parametrize( + ('email_input', 'status'), + [ + ['test', 'ISEMAIL_ERR_NODOMAIN'], + ['@', 'ISEMAIL_ERR_NOLOCALPART'], + ['test@', 'ISEMAIL_ERR_NODOMAIN'], + # ['test@io', 'ISEMAIL_VALID'], # we reject domains without a dot, knowing they are not deliverable + ['@io', 'ISEMAIL_ERR_NOLOCALPART'], + ['@iana.org', 'ISEMAIL_ERR_NOLOCALPART'], + ['test@iana.org', 'ISEMAIL_VALID'], + ['test@nominet.org.uk', 'ISEMAIL_VALID'], + ['test@about.museum', 'ISEMAIL_VALID'], + ['a@iana.org', 'ISEMAIL_VALID'], + ['test.test@iana.org', 'ISEMAIL_VALID'], + ['.test@iana.org', 'ISEMAIL_ERR_DOT_START'], + ['test.@iana.org', 'ISEMAIL_ERR_DOT_END'], + ['test..iana.org', 'ISEMAIL_ERR_CONSECUTIVEDOTS'], + ['test_exa-mple.com', 'ISEMAIL_ERR_NODOMAIN'], + ['!#$%&`*+/=?^`{|}~@iana.org', 'ISEMAIL_VALID'], + ['test\\@test@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], + ['123@iana.org', 'ISEMAIL_VALID'], + ['test@123.com', 'ISEMAIL_VALID'], + ['abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghiklm@iana.org', 'ISEMAIL_VALID'], + ['abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghiklmn@iana.org', 'ISEMAIL_RFC5322_LOCAL_TOOLONG'], + ['test@abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghiklm.com', 'ISEMAIL_RFC5322_LABEL_TOOLONG'], +
['test@mason-dixon.com', 'ISEMAIL_VALID'], + ['test@-iana.org', 'ISEMAIL_ERR_DOMAINHYPHENSTART'], + ['test@iana-.com', 'ISEMAIL_ERR_DOMAINHYPHENEND'], + ['test@g--a.com', 'ISEMAIL_VALID'], + ['test@.iana.org', 'ISEMAIL_ERR_DOT_START'], + ['test@iana.org.', 'ISEMAIL_ERR_DOT_END'], + ['test@iana..com', 'ISEMAIL_ERR_CONSECUTIVEDOTS'], + ['abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghiklm@abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghij', 'ISEMAIL_RFC5322_TOOLONG'], + ['a@abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefg.hij', 'ISEMAIL_RFC5322_TOOLONG'], + ['a@abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefg.hijk', 'ISEMAIL_RFC5322_DOMAIN_TOOLONG'], + ['"test"@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], + ['""@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], + ['"""@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], + ['"\\a"@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], + ['"\\""@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], + ['"\\"@iana.org', 'ISEMAIL_ERR_UNCLOSEDQUOTEDSTR'], + ['"\\\\"@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], + ['test"@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], + ['"test@iana.org', 'ISEMAIL_ERR_UNCLOSEDQUOTEDSTR'], + ['"test"test@iana.org', 'ISEMAIL_ERR_ATEXT_AFTER_QS'], + ['test"text"@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], + ['"test""test"@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], + ['"test"."test"@iana.org', 'ISEMAIL_DEPREC_LOCALPART'], + ['"test\\ test"@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], + 
['"test".test@iana.org', 'ISEMAIL_DEPREC_LOCALPART'], + ['"test\x00"@iana.org', 'ISEMAIL_ERR_EXPECTING_QTEXT'], + ['"test\\\x00"@iana.org', 'ISEMAIL_DEPREC_QP'], + ['"abcdefghijklmnopqrstuvwxyz abcdefghijklmnopqrstuvwxyz abcdefghj"@iana.org', 'ISEMAIL_RFC5322_LOCAL_TOOLONG'], + ['"abcdefghijklmnopqrstuvwxyz abcdefghijklmnopqrstuvwxyz abcdefg\\h"@iana.org', 'ISEMAIL_RFC5322_LOCAL_TOOLONG'], + ['test@[255.255.255.255]', 'ISEMAIL_RFC5321_ADDRESSLITERAL'], + ['test@a[255.255.255.255]', 'ISEMAIL_ERR_EXPECTING_ATEXT'], + ['test@[255.255.255]', 'ISEMAIL_RFC5322_DOMAINLITERAL'], + ['test@[255.255.255.255.255]', 'ISEMAIL_RFC5322_DOMAINLITERAL'], + ['test@[255.255.255.256]', 'ISEMAIL_RFC5322_DOMAINLITERAL'], + ['test@[1111:2222:3333:4444:5555:6666:7777:8888]', 'ISEMAIL_RFC5322_DOMAINLITERAL'], + ['test@[IPv6:1111:2222:3333:4444:5555:6666:7777]', 'ISEMAIL_RFC5322_IPV6_GRPCOUNT'], + ['test@[IPv6:1111:2222:3333:4444:5555:6666:7777:8888]', 'ISEMAIL_RFC5321_ADDRESSLITERAL'], + ['test@[IPv6:1111:2222:3333:4444:5555:6666:7777:8888:9999]', 'ISEMAIL_RFC5322_IPV6_GRPCOUNT'], + ['test@[IPv6:1111:2222:3333:4444:5555:6666:7777:888G]', 'ISEMAIL_RFC5322_IPV6_BADCHAR'], + ['test@[IPv6:1111:2222:3333:4444:5555:6666::8888]', 'ISEMAIL_RFC5321_IPV6DEPRECATED'], + ['test@[IPv6:1111:2222:3333:4444:5555::8888]', 'ISEMAIL_RFC5321_ADDRESSLITERAL'], + ['test@[IPv6:1111:2222:3333:4444:5555:6666::7777:8888]', 'ISEMAIL_RFC5322_IPV6_MAXGRPS'], + ['test@[IPv6::3333:4444:5555:6666:7777:8888]', 'ISEMAIL_RFC5322_IPV6_COLONSTRT'], + ['test@[IPv6:::3333:4444:5555:6666:7777:8888]', 'ISEMAIL_RFC5321_ADDRESSLITERAL'], + ['test@[IPv6:1111::4444:5555::8888]', 'ISEMAIL_RFC5322_IPV6_2X2XCOLON'], + ['test@[IPv6:::]', 'ISEMAIL_RFC5321_ADDRESSLITERAL'], + ['test@[IPv6:1111:2222:3333:4444:5555:255.255.255.255]', 'ISEMAIL_RFC5322_IPV6_GRPCOUNT'], + ['test@[IPv6:1111:2222:3333:4444:5555:6666:255.255.255.255]', 'ISEMAIL_RFC5321_ADDRESSLITERAL'], + ['test@[IPv6:1111:2222:3333:4444:5555:6666:7777:255.255.255.255]', 
'ISEMAIL_RFC5322_IPV6_GRPCOUNT'], + ['test@[IPv6:1111:2222:3333:4444::255.255.255.255]', 'ISEMAIL_RFC5321_ADDRESSLITERAL'], + ['test@[IPv6:1111:2222:3333:4444:5555:6666::255.255.255.255]', 'ISEMAIL_RFC5322_IPV6_MAXGRPS'], + ['test@[IPv6:1111:2222:3333:4444:::255.255.255.255]', 'ISEMAIL_RFC5322_IPV6_2X2XCOLON'], + ['test@[IPv6::255.255.255.255]', 'ISEMAIL_RFC5322_IPV6_COLONSTRT'], + [' test @iana.org', 'ISEMAIL_DEPREC_CFWS_NEAR_AT'], + ['test@ iana .com', 'ISEMAIL_DEPREC_CFWS_NEAR_AT'], + ['test . test@iana.org', 'ISEMAIL_DEPREC_FWS'], + ['\r\n test@iana.org', 'ISEMAIL_CFWS_FWS'], + ['\r\n \r\n test@iana.org', 'ISEMAIL_DEPREC_FWS'], + ['(comment)test@iana.org', 'ISEMAIL_CFWS_COMMENT'], + ['((comment)test@iana.org', 'ISEMAIL_ERR_UNCLOSEDCOMMENT'], + ['(comment(comment))test@iana.org', 'ISEMAIL_CFWS_COMMENT'], + ['test@(comment)iana.org', 'ISEMAIL_DEPREC_CFWS_NEAR_AT'], + ['test(comment)test@iana.org', 'ISEMAIL_ERR_ATEXT_AFTER_CFWS'], + ['test@(comment)[255.255.255.255]', 'ISEMAIL_DEPREC_CFWS_NEAR_AT'], + ['(comment)abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghiklm@iana.org', 'ISEMAIL_CFWS_COMMENT'], + ['test@(comment)abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.com', 'ISEMAIL_DEPREC_CFWS_NEAR_AT'], + ['(comment)test@abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghik.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghik.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk.abcdefghijklmnopqrstuvwxyzabcdefghijk.abcdefghijklmnopqrstu', 'ISEMAIL_CFWS_COMMENT'], + ['test@iana.org\n', 'ISEMAIL_ERR_EXPECTING_ATEXT'], + ['test@xn--hxajbheg2az3al.xn--jxalpdlp', 'ISEMAIL_VALID'], + ['xn--test@iana.org', 'ISEMAIL_VALID'], + ['test@iana.org-', 'ISEMAIL_ERR_DOMAINHYPHENEND'], + ['"test@iana.org', 'ISEMAIL_ERR_UNCLOSEDQUOTEDSTR'], + ['(test@iana.org', 'ISEMAIL_ERR_UNCLOSEDCOMMENT'], + ['test@(iana.org', 'ISEMAIL_ERR_UNCLOSEDCOMMENT'], + ['test@[1.2.3.4', 'ISEMAIL_ERR_UNCLOSEDDOMLIT'], + ['"test\\"@iana.org', 
'ISEMAIL_ERR_UNCLOSEDQUOTEDSTR'], + ['(comment\\)test@iana.org', 'ISEMAIL_ERR_UNCLOSEDCOMMENT'], + ['test@iana.org(comment\\)', 'ISEMAIL_ERR_UNCLOSEDCOMMENT'], + ['test@iana.org(comment\\', 'ISEMAIL_ERR_BACKSLASHEND'], + ['test@[RFC-5322-domain-literal]', 'ISEMAIL_RFC5322_DOMAINLITERAL'], + ['test@[RFC-5322]-domain-literal]', 'ISEMAIL_ERR_ATEXT_AFTER_DOMLIT'], + ['test@[RFC-5322-[domain-literal]', 'ISEMAIL_ERR_EXPECTING_DTEXT'], + ['test@[RFC-5322-\\\x07-domain-literal]', 'ISEMAIL_RFC5322_DOMLIT_OBSDTEXT'], + ['test@[RFC-5322-\\\t-domain-literal]', 'ISEMAIL_RFC5322_DOMLIT_OBSDTEXT'], + ['test@[RFC-5322-\\]-domain-literal]', 'ISEMAIL_RFC5322_DOMLIT_OBSDTEXT'], + ['test@[RFC-5322-domain-literal\\]', 'ISEMAIL_ERR_UNCLOSEDDOMLIT'], + ['test@[RFC-5322-domain-literal\\', 'ISEMAIL_ERR_BACKSLASHEND'], + ['test@[RFC 5322 domain literal]', 'ISEMAIL_RFC5322_DOMAINLITERAL'], + ['test@[RFC-5322-domain-literal] (comment)', 'ISEMAIL_RFC5322_DOMAINLITERAL'], + ['\x7f@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], + ['test@\x7f.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], + ['"\x7f"@iana.org', 'ISEMAIL_DEPREC_QTEXT'], + ['"\\\x7f"@iana.org', 'ISEMAIL_DEPREC_QP'], + ['(\x7f)test@iana.org', 'ISEMAIL_DEPREC_CTEXT'], + ['test@iana.org\r', 'ISEMAIL_ERR_CR_NO_LF'], + ['\rtest@iana.org', 'ISEMAIL_ERR_CR_NO_LF'], + ['"\rtest"@iana.org', 'ISEMAIL_ERR_CR_NO_LF'], + ['(\r)test@iana.org', 'ISEMAIL_ERR_CR_NO_LF'], + ['test@iana.org(\r)', 'ISEMAIL_ERR_CR_NO_LF'], + ['\ntest@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], + ['"\n"@iana.org', 'ISEMAIL_ERR_EXPECTING_QTEXT'], + ['"\\\n"@iana.org', 'ISEMAIL_DEPREC_QP'], + ['(\n)test@iana.org', 'ISEMAIL_ERR_EXPECTING_CTEXT'], + ['\x07@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], + ['test@\x07.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], + ['"\x07"@iana.org', 'ISEMAIL_DEPREC_QTEXT'], + ['"\\\x07"@iana.org', 'ISEMAIL_DEPREC_QP'], + ['(\x07)test@iana.org', 'ISEMAIL_DEPREC_CTEXT'], + ['\r\ntest@iana.org', 'ISEMAIL_ERR_FWS_CRLF_END'], + ['\r\n \r\ntest@iana.org', 
'ISEMAIL_ERR_FWS_CRLF_END'], + [' \r\ntest@iana.org', 'ISEMAIL_ERR_FWS_CRLF_END'], + [' \r\n test@iana.org', 'ISEMAIL_CFWS_FWS'], + [' \r\n \r\ntest@iana.org', 'ISEMAIL_ERR_FWS_CRLF_END'], + [' \r\n\r\ntest@iana.org', 'ISEMAIL_ERR_FWS_CRLF_X2'], + [' \r\n\r\n test@iana.org', 'ISEMAIL_ERR_FWS_CRLF_X2'], + ['test@iana.org\r\n ', 'ISEMAIL_CFWS_FWS'], + ['test@iana.org\r\n \r\n ', 'ISEMAIL_DEPREC_FWS'], + ['test@iana.org\r\n', 'ISEMAIL_ERR_FWS_CRLF_END'], + ['test@iana.org\r\n \r\n', 'ISEMAIL_ERR_FWS_CRLF_END'], + ['test@iana.org \r\n', 'ISEMAIL_ERR_FWS_CRLF_END'], + ['test@iana.org \r\n ', 'ISEMAIL_CFWS_FWS'], + ['test@iana.org \r\n \r\n', 'ISEMAIL_ERR_FWS_CRLF_END'], + ['test@iana.org \r\n\r\n', 'ISEMAIL_ERR_FWS_CRLF_X2'], + ['test@iana.org \r\n\r\n ', 'ISEMAIL_ERR_FWS_CRLF_X2'], + [' test@iana.org', 'ISEMAIL_CFWS_FWS'], + ['test@iana.org ', 'ISEMAIL_CFWS_FWS'], + ['test@[IPv6:1::2:]', 'ISEMAIL_RFC5322_IPV6_COLONEND'], + ['"test\\©"@iana.org', 'ISEMAIL_ERR_EXPECTING_QPAIR'], + ['test@iana/icann.org', 'ISEMAIL_RFC5322_DOMAIN'], + ['test.(comment)test@iana.org', 'ISEMAIL_DEPREC_COMMENT'] + ] +) +def test_pyisemail_tests(email_input, status): + if status == "ISEMAIL_VALID": + # All standard email address forms should not raise an exception. + validate_email(email_input, test_environment=True) + elif "_ERR_" in status or "_TOOLONG" in status \ + or "_CFWS_FWS" in status or "_CFWS_COMMENT" in status \ + or "_IPV6" in status or status == "ISEMAIL_RFC5322_DOMAIN": + # Invalid syntax, extranous whitespace, and "(comments)" should be rejected. + # The _IPV6_ diagnoses appear to represent syntactically invalid domain literals. + # The ISEMAIL_RFC5322_DOMAIN diagnosis appears to be a syntactically invalid domain. 
+ with pytest.raises(EmailSyntaxError): + validate_email(email_input, test_environment=True) + elif "_DEPREC_" in status \ + or "RFC5321_QUOTEDSTRING" in status \ + or "DOMAINLITERAL" in status or "_DOMLIT_" in status or "_ADDRESSLITERAL" in status: + # Quoted strings in the local part, domain literals (IP addresses in brackets), + # and other deprecated syntax are valid email addresses and are accepted by pyIsEmail, + # but we reject them. + with pytest.raises(EmailSyntaxError): + validate_email(email_input, test_environment=True) + else: + raise ValueError("status {} is not recognized".format(status)) + + def test_dict_accessor(): input_email = "testaddr@example.tld" valid_email = validate_email(input_email, check_deliverability=False) From 35d9a18508e6543a368aa9cad7a8a414697159d5 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Thu, 12 May 2022 16:50:14 -0400 Subject: [PATCH 043/174] The module-level attributes added in d64b2915cd48408686d32bc30eba327f2f9086f9 to set global defaults were not working This fixes the problem and restores the documentation that was previously reverted in b87f8d3ea68e46b78247e7dee3b85822cce464bc. --- README.md | 13 +++++++------ email_validator/__init__.py | 24 +++++++++++++++++++----- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 81df18d..1a17c77 100644 --- a/README.md +++ b/README.md @@ -140,16 +140,16 @@ The `validate_email` function also accepts the following keyword arguments `allow_smtputf8=True`: Set to `False` to prohibit internationalized addresses that would require the - [SMTPUTF8](https://tools.ietf.org/html/rfc6531) extension. + [SMTPUTF8](https://tools.ietf.org/html/rfc6531) extension. You can also set `email_validator.ALLOW_SMTPUTF8` to `False` to turn it off for all calls by default. -`check_deliverability=True`: Set to `False` to skip DNS record checks for the domain. 
It is recommended to pass `False` when performing validation for login pages since re-validation of the domain by querying DNS at every login is probably undesirable. +`check_deliverability=True`: Set to `False` to skip DNS record checks for the domain. It is recommended to pass `False` when performing validation for login pages since re-validation of the domain by querying DNS at every login is probably undesirable. You can also set `email_validator.CHECK_DELIVERABILITY` to `False` to turn this off for all calls by default. `allow_empty_local=False`: Set to `True` to allow an empty local part (i.e. `@example.com`), e.g. for validating Postfix aliases. `dns_resolver=None`: Pass an instance of [dns.resolver.Resolver](https://dnspython.readthedocs.io/en/latest/resolver-class.html) to control the DNS resolver including setting a timeout and [a cache](https://dnspython.readthedocs.io/en/latest/resolver-caching.html). The `caching_resolver` function shown above is a helper function to construct a dns.resolver.Resolver with a [LRUCache](https://dnspython.readthedocs.io/en/latest/resolver-caching.html#dns.resolver.LRUCache). Reuse the same resolver instance across calls to `validate_email` to make use of the cache. -`test_environment=False`: DNS-based deliverability checks are disabled and `test` and `subdomain.test` domain names are permitted (see below). +`test_environment=False`: DNS-based deliverability checks are disabled and `test` and `subdomain.test` domain names are permitted (see below). You can also set `email_validator.TEST_ENVIRONMENT` to `True` to turn it on for all calls by default. ### DNS timeout and cache @@ -166,10 +166,11 @@ while True: ### Test addresses -This library rejects email addresess that use the [Special Use Domain Names](https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml) `invalid`, `localhost`, `test`, and some others by raising `EmailUndeliverableError`. 
This is to protect your system from abuse: You probably don't want a user to be able to cause an email to be sent to `localhost`. However, in your non-production test environments you may want to use `@test` or `@myname.test` email addresses. There are two ways you can allow this: +This library rejects email addresess that use the [Special Use Domain Names](https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml) `invalid`, `localhost`, `test`, and some others by raising `EmailUndeliverableError`. This is to protect your system from abuse: You probably don't want a user to be able to cause an email to be sent to `localhost`. However, in your non-production test environments you may want to use `@test` or `@myname.test` email addresses. There are three ways you can allow this: -A. Add `test_environment=True` to the call to `validate_email` (see above). -B. Remove the special-use domain name that you want to use from `email_validator.SPECIAL_USE_DOMAIN_NAMES`: +1. Add `test_environment=True` to the call to `validate_email` (see above). +2. Set `email_validator.TEST_ENVIRONMENT` to `True`. +3. 
Remove the special-use domain name that you want to use from `email_validator.SPECIAL_USE_DOMAIN_NAMES`: ```python import email_validator diff --git a/email_validator/__init__.py b/email_validator/__init__.py index 853c942..db0150c 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -251,7 +251,9 @@ def __get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH): return reason.format(prefix, diff, suffix) -def caching_resolver(timeout=DEFAULT_TIMEOUT, cache=None): +def caching_resolver(timeout=None, cache=None): + if timeout is None: + timeout = DEFAULT_TIMEOUT resolver = dns.resolver.Resolver() resolver.cache = cache or dns.resolver.LRUCache() resolver.lifetime = timeout # timeout, in seconds @@ -260,11 +262,11 @@ def caching_resolver(timeout=DEFAULT_TIMEOUT, cache=None): def validate_email( email, - allow_smtputf8=ALLOW_SMTPUTF8, + allow_smtputf8=None, allow_empty_local=False, - check_deliverability=CHECK_DELIVERABILITY, - test_environment=TEST_ENVIRONMENT, - timeout=DEFAULT_TIMEOUT, + check_deliverability=None, + test_environment=None, + timeout=None, dns_resolver=None ): """ @@ -273,6 +275,16 @@ def validate_email( but if bytes it must be ASCII-only. """ + # Fill in default values of arguments. + if allow_smtputf8 is None: + allow_smtputf8 = ALLOW_SMTPUTF8 + if check_deliverability is None: + check_deliverability = CHECK_DELIVERABILITY + if test_environment is None: + test_environment = TEST_ENVIRONMENT + if timeout is None: + timeout = DEFAULT_TIMEOUT + # Allow email to be a str or bytes instance. If bytes, # it must be ASCII because that's how the bytes work # on the wire with SMTP. @@ -579,6 +591,8 @@ def validate_email_domain_part(domain, test_environment=False): def validate_email_deliverability(domain, domain_i18n, timeout=DEFAULT_TIMEOUT, dns_resolver=None): # Check that the domain resolves to an MX record. If there is no MX record, # try an A or AAAA record which is a deprecated fallback for deliverability. 
+ # (Note that changing the DEFAULT_TIMEOUT module-level attribute + # will not change the default value of this method's timeout argument.) # If no dns.resolver.Resolver was given, get dnspython's default resolver. # Override the default resolver's timeout. This may affect other uses of From 2ad9b1bced5e46384adacf81e0f51026d1339ec9 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Thu, 12 May 2022 16:52:27 -0400 Subject: [PATCH 044/174] Mark the arguments of the public methods as keyword-only except the email argument to validate_email The email argument should be positional-only, but we're still building with Python 3.6-7 which don't support that yet. --- email_validator/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/email_validator/__init__.py b/email_validator/__init__.py index db0150c..f835d71 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -251,7 +251,7 @@ def __get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH): return reason.format(prefix, diff, suffix) -def caching_resolver(timeout=None, cache=None): +def caching_resolver(*, timeout=None, cache=None): if timeout is None: timeout = DEFAULT_TIMEOUT resolver = dns.resolver.Resolver() @@ -262,6 +262,8 @@ def caching_resolver(timeout=None, cache=None): def validate_email( email, + # /, # not supported in Python 3.6, 3.7 + *, allow_smtputf8=None, allow_empty_local=False, check_deliverability=None, From a906eb512a42c5550ff117c692fb164a4b6b050b Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 12 Jul 2022 07:18:38 -0400 Subject: [PATCH 045/174] Change special use domain names to raise EmailSyntaxError instead of EmailUndeliverableError This way all DNS-based checks raise EmailUndeliverableError and all non-DNS-based checks raise EmailSyntaxError. This was suggested by someone in GitHub issues although I can't find that anymore. 
--- README.md | 10 ++++------ email_validator/__init__.py | 10 ++++------ tests/test_main.py | 2 +- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 1a17c77..b08a6a9 100644 --- a/README.md +++ b/README.md @@ -166,7 +166,7 @@ while True: ### Test addresses -This library rejects email addresess that use the [Special Use Domain Names](https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml) `invalid`, `localhost`, `test`, and some others by raising `EmailUndeliverableError`. This is to protect your system from abuse: You probably don't want a user to be able to cause an email to be sent to `localhost`. However, in your non-production test environments you may want to use `@test` or `@myname.test` email addresses. There are three ways you can allow this: +This library rejects email addresess that use the [Special Use Domain Names](https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml) `invalid`, `localhost`, `test`, and some others by raising `EmailSyntaxError`. This is to protect your system from abuse: You probably don't want a user to be able to cause an email to be sent to `localhost`. However, in your non-production test environments you may want to use `@test` or `@myname.test` email addresses. There are three ways you can allow this: 1. Add `test_environment=True` to the call to `validate_email` (see above). 2. Set `email_validator.TEST_ENVIRONMENT` to `True`. @@ -410,11 +410,9 @@ or likely to cause trouble: (without NULL MX or SPF -all DNS records) if deliverability checks are turned on. Most [Special Use Domain Names](https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml) - and their subdomains are considered invalid (except see - the `test_environment` parameter above), if deliverability checks are - turned on. 
Domain names without a `.` are rejected as a syntax error - since no one has an email address directly at a TLD, and a missing - TLD is a common user error. + and their subdomains and + domain names without a `.` are rejected as a syntax error + (except see the `test_environment` parameter above). * Obsolete email syntaxes are rejected: The "quoted string" form of the local part of the email address (RFC 5321 4.1.2) is not permitted. diff --git a/email_validator/__init__.py b/email_validator/__init__.py index f835d71..d453423 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -559,8 +559,7 @@ def validate_email_domain_part(domain, test_environment=False): if "." not in ascii_domain and not (ascii_domain == "test" and test_environment): raise EmailSyntaxError("The domain name %s is not valid. It should have a period." % domain_i18n) - # Check special-use and reserved domain names. Raise these as - # deliverability errors since they are syntactically valid. + # Check special-use and reserved domain names. # Some might fail DNS-based deliverability checks, but that # can be turned off, so we should fail them all sooner. for d in SPECIAL_USE_DOMAIN_NAMES: @@ -569,12 +568,11 @@ def validate_email_domain_part(domain, test_environment=False): continue if ascii_domain == d or ascii_domain.endswith("." + d): - raise EmailUndeliverableError("The domain name %s is a special-use or reserved name that cannot be used with email." % domain_i18n) + raise EmailSyntaxError("The domain name %s is a special-use or reserved name that cannot be used with email." % domain_i18n) - # We also know that all TLDs currently end with a letter, and - # we'll consider that a non-DNS based deliverability check. + # We also know that all TLDs currently end with a letter. if not re.search(r"[A-Za-z]\Z", ascii_domain): - raise EmailUndeliverableError( + raise EmailSyntaxError( "The domain name %s is not valid. It is not within a valid top-level domain." 
% domain_i18n ) diff --git a/tests/test_main.py b/tests/test_main.py index bf2a12f..1ca6d76 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -271,7 +271,7 @@ def test_email_invalid_syntax(email_input, error_msg): def test_email_invalid_reserved_domain(email_input): # Since these all fail deliverabiltiy from a static list, # DNS deliverability checks do not arise. - with pytest.raises(EmailUndeliverableError) as exc_info: + with pytest.raises(EmailSyntaxError) as exc_info: validate_email(email_input) # print(f'({email_input!r}, {str(exc_info.value)!r}),') assert "is a special-use or reserved name" in str(exc_info.value) From 6c317d20287acc7c8c59d25a936cca42be3e067d Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 12 Jul 2022 07:35:38 -0400 Subject: [PATCH 046/174] Add recent changelog to README --- README.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b08a6a9..908436c 100644 --- a/README.md +++ b/README.md @@ -42,10 +42,16 @@ This library was first published in 2015. The current version is 1.2.1 be doing on all of your user inputs already!) * Rejecting most special-use reserved domain names. A new `test_environment` option is added for using `@*.test` domains. -* New module-level attributes are added to override the default values - of the keyword arguments and the special-use domains list. * Some fixes in the tests. +Unreleased changes in development: + +* Deliverability checks now check for 'v=spf1 -all' SPF records as a way to reject more bad domains. +* Special use domain names now raise EmailSyntaxError instead of EmailUndeliverableError. +* New module-level attributes are added to override the default values of the keyword arguments and the special-use domains list. +* The keyword arguments of the public methods are now marked as keyword-only. +* [pyIsEmail](https://github.com/michaelherold/pyIsEmail)'s test cases are added to the tests. 
+ --- Installation From 37052e02830ebe591c81bd6c3dc47fc8077a016c Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 12 Jul 2022 07:46:29 -0400 Subject: [PATCH 047/174] Update Travis to build with Python 3.10 and add 3.10 to the classifiers, and drop 3.7 and 3.8 whose builds are failing Fixes #83. --- .travis.yml | 8 ++++---- setup.cfg | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 0ce2828..d0d8d02 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,14 +1,14 @@ os: linux -dist: xenial +dist: bionic language: python cache: pip python: -#- '2.7' - '3.6' -- '3.7' -- '3.8' +#- '3.7' +#- '3.8' - '3.9' +- '3.10' install: - make install diff --git a/setup.cfg b/setup.cfg index d32921b..c570779 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,6 +19,7 @@ classifiers = Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 + Programming Language :: Python :: 3.10 Topic :: Software Development :: Libraries :: Python Modules keywords = email address validator From f728d882819263b12532f2ca9034801c085422a0 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 3 Sep 2022 16:15:29 -0400 Subject: [PATCH 048/174] Make a CHANGELOG file --- CHANGELOG.md | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 21 +---------- 2 files changed, 106 insertions(+), 20 deletions(-) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..425041c --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,105 @@ +Unreleased changes in development +--------------------------------- + +* Deliverability checks now check for 'v=spf1 -all' SPF records as a way to reject more bad domains. +* Special use domain names now raise EmailSyntaxError instead of EmailUndeliverableError since they are performed even if check_deliverability is off. 
+* New module-level attributes are added to override the default values of the keyword arguments and the special-use domains list. +* The keyword arguments of the public methods are now marked as keyword-only. +* [pyIsEmail](https://github.com/michaelherold/pyIsEmail)'s test cases are added to the tests. +* Recommend that check_deliverability be set to False for validation on login pages. + +Version 1.2.1 (May 1, 2022) +--------------------------- + +* example.com/net/org are removed from the special-use reserved domain names list so that they do not raise exceptions if check_deliverability is off. +* Improved README. + +Version 1.2.0 (April 24, 2022) +------------------------------ + +* Reject domains with NULL MX records (when deliverability checks + are turned on). +* Reject unsafe unicode characters. (Some of these checks you should + be doing on all of your user inputs already!) +* Reject most special-use reserved domain names with EmailUndeliverableError. A new `test_environment` option is added for using `@*.test` domains. +* Improved safety of exception text by not repeating an unsafe input character in the message. +* Minor fixes in tests. +* Invoking the module as a standalone program now caches DNS queries. +* Improved README. + +Version 1.1.3 (June 12, 2021) +----------------------------- + +* Allow passing a custom dns_resolver so that a DNS cache and a custom timeout can be set. + +Version 1.1.2 (Nov 5, 2020) +--------------------------- + +* Fix invoking the module as a standalone program. +* Fix deprecation warning in Python 3.8. +* Code improvements. +* Improved README. + +Version 1.1.1 (May 19, 2020) +---------------------------- + +* Fix exception when DNS queries time-out. +* Improved README. + +Version 1.1.0 (April 30, 2020) +------------------------------ + +* The main function now returns an object with attributes rather than a dict with keys, but accessing the object in the old way is still supported.
+* Added overall email address length checks. +* Minor tweak to regular expressions. +* Improved error messages. +* Added tests. +* Linted source code files; changed README to Markdown. + +Version 1.0.5 (Oct 18, 2019) +---------------------------- + +* Prevent resolving domain names as if they were not fully qualified using a local search domain settings. + +Version 1.0.4 (May 2, 2019) +--------------------------- + +* Added a timeout argument for DNS queries. +* The wheel distribution is now a universal wheel. +* Improved README. + +Version 1.0.3 (Sept 12, 2017) +----------------------------- + +* Added a wheel distribution for easier installation. + +Version 1.0.2 (Dec 30, 2016) +---------------------------- + +* Fix dnspython package name in Python 3. +* Improved README. + +Version 1.0.1 (March 6, 2016) +----------------------------- + +* Fixed minor errors. + +Version 1.0.0 (Sept 5, 2015) +---------------------------- + +* Fail domains with a leading period. +* Improved error messages. +* Added tests. + +Version 0.5.0 (June 15, 2015) +----------------------------- + +* Use IDNA 2008 instead of IDNA 2003 and use the idna package's UTS46 normalization instead of our own. +* Fixes for Python 2. +* Improved error messages. +* Improved README. + +Version 0.1.0 (April 21, 2015) +------------------------------ + +Initial release! diff --git a/README.md b/README.md index 908436c..096270d 100644 --- a/README.md +++ b/README.md @@ -31,26 +31,7 @@ This library is tested with Python 3.6+ but should work in earlier versions: [![Build Status](https://app.travis-ci.com/JoshData/python-email-validator.svg?branch=main)](https://app.travis-ci.com/JoshData/python-email-validator) ---- - -This library was first published in 2015. The current version is 1.2.1 -(posted May 1, 2022). The main changes in version 1.2 are: - -* Rejecting domains with NULL MX records (when deliverability checks - are turned on). -* Rejecting unsafe unicode characters. 
(Some of these checks you should - be doing on all of your user inputs already!) -* Rejecting most special-use reserved domain names. A new `test_environment` - option is added for using `@*.test` domains. -* Some fixes in the tests. - -Unreleased changes in development: - -* Deliverability checks now check for 'v=spf1 -all' SPF records as a way to reject more bad domains. -* Special use domain names now raise EmailSyntaxError instead of EmailUndeliverableError. -* New module-level attributes are added to override the default values of the keyword arguments and the special-use domains list. -* The keyword arguments of the public methods are now marked as keyword-only. -* [pyIsEmail](https://github.com/michaelherold/pyIsEmail)'s test cases are added to the tests. +[CHANGELOG / Release Notes](CHANGELOG.md) --- From a6664814bdbcdcf6eb4a3eca469540824a6ea6f0 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 3 Sep 2022 16:32:43 -0400 Subject: [PATCH 049/174] Add undocumented globally_deliverable option for #86 --- CHANGELOG.md | 1 + email_validator/__init__.py | 33 ++++++++++++++++++--------------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 425041c..bafb0b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ Unreleased changes in development * The keyword arguments of the public methods are now marked as keyword-only. * [pyIsEmail](https://github.com/michaelherold/pyIsEmail)'s test cases are added to the tests. * Recommend that check_deliverability be set to False for validation on login pages. +* Added an undocumented globally_deliverable option. 
Version 1.2.1 (May 1, 2022) --------------------------- diff --git a/email_validator/__init__.py b/email_validator/__init__.py index d453423..b644a63 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -12,6 +12,7 @@ ALLOW_SMTPUTF8 = True CHECK_DELIVERABILITY = True TEST_ENVIRONMENT = False +GLOBALLY_DELIVERABLE = True DEFAULT_TIMEOUT = 15 # secs # Based on RFC 2822 section 3.2.4 / RFC 5322 section 3.2.3, these @@ -268,6 +269,7 @@ def validate_email( allow_empty_local=False, check_deliverability=None, test_environment=None, + globally_deliverable=GLOBALLY_DELIVERABLE, timeout=None, dns_resolver=None ): @@ -314,7 +316,7 @@ def validate_email( ret.smtputf8 = local_part_info["smtputf8"] # Validate the email address's domain part syntax and get a normalized form. - domain_part_info = validate_email_domain_part(parts[1], test_environment=test_environment) + domain_part_info = validate_email_domain_part(parts[1], test_environment=test_environment, globally_deliverable=globally_deliverable) ret.domain = domain_part_info["domain"] ret.ascii_domain = domain_part_info["ascii_domain"] @@ -473,7 +475,7 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals } -def validate_email_domain_part(domain, test_environment=False): +def validate_email_domain_part(domain, test_environment=False, globally_deliverable=True): # Empty? if len(domain) == 0: raise EmailSyntaxError("There must be something after the @-sign.") @@ -551,13 +553,20 @@ def validate_email_domain_part(domain, test_environment=False): if not m: raise EmailSyntaxError("The email address contains invalid characters after the @-sign.") - # All publicly deliverable addresses have domain named with at least - # one period, and we'll consider the lack of a period a syntax error - # since that will match people's sense of what an email address looks - # like. We'll skip this in test environments to allow '@test' email - # addresses. - if "." 
not in ascii_domain and not (ascii_domain == "test" and test_environment): - raise EmailSyntaxError("The domain name %s is not valid. It should have a period." % domain_i18n) + if globally_deliverable: + # All publicly deliverable addresses have domain named with at least + # one period, and we'll consider the lack of a period a syntax error + # since that will match people's sense of what an email address looks + # like. We'll skip this in test environments to allow '@test' email + # addresses. + if "." not in ascii_domain and not (ascii_domain == "test" and test_environment): + raise EmailSyntaxError("The domain name %s is not valid. It should have a period." % domain_i18n) + + # We also know that all TLDs currently end with a letter. + if not re.search(r"[A-Za-z]\Z", ascii_domain): + raise EmailSyntaxError( + "The domain name %s is not valid. It is not within a valid top-level domain." % domain_i18n + ) # Check special-use and reserved domain names. # Some might fail DNS-based deliverability checks, but that @@ -570,12 +579,6 @@ def validate_email_domain_part(domain, test_environment=False): if ascii_domain == d or ascii_domain.endswith("." + d): raise EmailSyntaxError("The domain name %s is a special-use or reserved name that cannot be used with email." % domain_i18n) - # We also know that all TLDs currently end with a letter. - if not re.search(r"[A-Za-z]\Z", ascii_domain): - raise EmailSyntaxError( - "The domain name %s is not valid. It is not within a valid top-level domain." 
% domain_i18n - ) - # Return the IDNA ASCII-encoded form of the domain, which is how it # would be transmitted on the wire (except when used with SMTPUTF8 # possibly), as well as the canonical Unicode form of the domain, From dd13053b6463046f6f7186a7803a2ad83ebbbd65 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 5 Sep 2022 08:45:54 -0400 Subject: [PATCH 050/174] Some README tweaks --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 096270d..7c8409c 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ This library is tested with Python 3.6+ but should work in earlier versions: [![Build Status](https://app.travis-ci.com/JoshData/python-email-validator.svg?branch=main)](https://app.travis-ci.com/JoshData/python-email-validator) -[CHANGELOG / Release Notes](CHANGELOG.md) +View the [CHANGELOG / Release Notes](CHANGELOG.md) for the version history of changes in the library. Occasionally this README is ahead of the latest published package --- see the CHANGELOG for details. --- @@ -111,7 +111,6 @@ The validator checks that the domain name in the email address has a DNS MX record (except a NULL MX record) indicating that it can receive email and that it does not have a reject-all SPF record (`v=spf1 -all`) which would indicate that it cannot send email. -(A/AAAA-record MX fallback is also checked.) There is nothing to be gained by trying to actually contact an SMTP server, so that's not done here. For privacy, security, and practicality reasons servers are good at not giving away whether an address is @@ -129,7 +128,7 @@ The `validate_email` function also accepts the following keyword arguments require the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) extension. You can also set `email_validator.ALLOW_SMTPUTF8` to `False` to turn it off for all calls by default. -`check_deliverability=True`: Set to `False` to skip DNS record checks for the domain. 
It is recommended to pass `False` when performing validation for login pages since re-validation of the domain by querying DNS at every login is probably undesirable. You can also set `email_validator.CHECK_DELIVERABILITY` to `False` to turn this off for all calls by default. +`check_deliverability=True`: If true, DNS queries check that a non-null MX (or A/AAAA record as an MX fallback) is present for the domain-part of the email address and that a reject-all SPF record is not present. Set to `False` to skip these DNS checks. DNS is slow and sometimes unavailable, so consider whether these checks are useful for your use case. It is recommended to pass `False` when performing validation for login pages (but not account creation pages) since re-validation of the domain by querying DNS at every login is probably undesirable. You can also set `email_validator.CHECK_DELIVERABILITY` to `False` to turn this off for all calls by default. `allow_empty_local=False`: Set to `True` to allow an empty local part (i.e. `@example.com`), e.g. for validating Postfix aliases. From 10c34e6f07fa29c72da00b3635293ab38db0d20f Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 18 Sep 2022 15:24:51 -0400 Subject: [PATCH 051/174] Version 1.3.0 --- CHANGELOG.md | 4 ++-- README.md | 14 ++++++-------- release_to_pypi.sh | 6 ++++++ setup.cfg | 4 ++-- 4 files changed, 16 insertions(+), 12 deletions(-) create mode 100755 release_to_pypi.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index bafb0b6..f1630e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,5 @@ -Unreleased changes in development ---------------------------------- +Version 1.3.0 (September 18, 2022) +---------------------------------- * Deliverability checks now check for 'v=spf1 -all' SPF records as a way to reject more bad domains. * Special use domain names now raise EmailSyntaxError instead of EmailUndeliverableError since they are performed even if check_deliverability is off. 
diff --git a/README.md b/README.md index 7c8409c..14ce832 100644 --- a/README.md +++ b/README.md @@ -425,17 +425,15 @@ The package is distributed as a universal wheel and as a source package. To release: -* Update the version number. -* Follow the steps below to publish source and a universal wheel to pypi. +* Update CHANGELOG.md. +* Update the version number in setup.cfg. +* Make a commit with the new version number. +* Follow the steps below to publish source and a universal wheel to pypi and tag the release. * Make a release at https://github.com/JoshData/python-email-validator/releases/new. ```sh -pip3 install twine -rm -rf dist -python3 setup.py sdist -python3 setup.py bdist_wheel -twine upload dist/* # username: __token__ password: pypi API token -git tag v1.0.XXX # replace with version in setup.cfg +./release_to_pypi.sh +git tag v$(grep version setup.cfg | sed "s/.*= //") git push --tags ``` diff --git a/release_to_pypi.sh b/release_to_pypi.sh new file mode 100755 index 0000000..d8d5e05 --- /dev/null +++ b/release_to_pypi.sh @@ -0,0 +1,6 @@ +#!/bin/sh +pip3 install --upgrade twine +rm -rf dist +python3 setup.py sdist +python3 setup.py bdist_wheel +twine upload -u __token__ dist/* # username: __token__ password: pypi API token diff --git a/setup.cfg b/setup.cfg index c570779..05ec00e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,7 +1,7 @@ [metadata] name = email_validator -version = 1.2.1 -description = A robust email syntax and deliverability validation library. +version = 1.3.0 +description = A robust email address syntax and deliverability validation library. 
long_description = file: README.md long_description_content_type = text/markdown url = https://github.com/JoshData/python-email-validator From d7fd074eeee47663042730943861980495fc8225 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 17 Oct 2022 07:56:37 -0400 Subject: [PATCH 052/174] Note that Python 2.x is no longer supported (fixes #91) --- CHANGELOG.md | 2 +- README.md | 11 ++++++----- setup.cfg | 4 +--- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f1630e9..6e4861e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ Version 1.3.0 (September 18, 2022) * Deliverability checks now check for 'v=spf1 -all' SPF records as a way to reject more bad domains. * Special use domain names now raise EmailSyntaxError instead of EmailUndeliverableError since they are performed even if check_deliverability is off. * New module-level attributes are added to override the default values of the keyword arguments and the special-use domains list. -* The keyword arguments of the public methods are now marked as keyword-only. +* The keyword arguments of the public methods are now marked as keyword-only, ending support for Python 2.x. * [pyIsEmail](https://github.com/michaelherold/pyIsEmail)'s test cases are added to the tests. * Recommend that check_deliverability be set to False for validation on login pages. * Added an undocumented globally_deliverable option. diff --git a/README.md b/README.md index 14ce832..757329e 100644 --- a/README.md +++ b/README.md @@ -252,13 +252,14 @@ part is converted to [IDNA ASCII](https://tools.ietf.org/html/rfc5891). (You probably should not do this at account creation time so you don't change the user's login information without telling them.) -### UCS-4 support required for Python 2.7 +### Support for Python 2.7 -This library hopefully still works with Python 2.7. 
-Note that when using Python 2.7, it is required that it was built with +The last version of this library supporting Python 2.x is version 1.2.1. + +When using Python 2.x, it is required that it was built with UCS-4 support (see -[here](https://stackoverflow.com/questions/29109944/python-returns-length-of-2-for-single-unicode-character-string)); -otherwise emails with unicode characters outside of the BMP (Basic +[here](https://stackoverflow.com/questions/29109944/python-returns-length-of-2-for-single-unicode-character-string)). +Without UCS-4 support, unicode characters outside of the BMP (Basic Multilingual Plane) will not validate correctly. Normalization diff --git a/setup.cfg b/setup.cfg index 05ec00e..c4d6f41 100644 --- a/setup.cfg +++ b/setup.cfg @@ -13,8 +13,6 @@ classifiers = Development Status :: 5 - Production/Stable Intended Audience :: Developers License :: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication - Programming Language :: Python :: 2 - Programming Language :: Python :: 2.7 Programming Language :: Python :: 3 Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 @@ -28,7 +26,7 @@ packages = find: install_requires = dnspython>=1.15.0 idna>=2.0.0 -python_requires = >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.* +python_requires = >=3.5 [options.entry_points] console_scripts = From 2f11b627e82dca8ece4302c9b77141e82ed81770 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Fri, 30 Sep 2022 10:42:38 -0400 Subject: [PATCH 053/174] Limit the SPF reject-all check to domains without MX records that have fallback A/AAAA records Fixes #90. 
--- CHANGELOG.md | 5 +++++ README.md | 14 ++++++-------- email_validator/__init__.py | 38 +++++++++++++++++++------------------ tests/test_main.py | 2 +- 4 files changed, 32 insertions(+), 27 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e4861e..2f1a0d0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +In Development +-------------- + +* The new SPF reject-all record check is now limited to domains that do not have MX records but do have an A/AAAA record fallback. + Version 1.3.0 (September 18, 2022) ---------------------------------- diff --git a/README.md b/README.md index 757329e..84c4120 100644 --- a/README.md +++ b/README.md @@ -109,8 +109,7 @@ later in the document about that.) The validator checks that the domain name in the email address has a DNS MX record (except a NULL MX record) indicating that it can receive -email and that it does not have a reject-all SPF record (`v=spf1 -all`) -which would indicate that it cannot send email. +email (or a fallback A-record, see below). There is nothing to be gained by trying to actually contact an SMTP server, so that's not done here. For privacy, security, and practicality reasons servers are good at not giving away whether an address is @@ -128,7 +127,7 @@ The `validate_email` function also accepts the following keyword arguments require the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) extension. You can also set `email_validator.ALLOW_SMTPUTF8` to `False` to turn it off for all calls by default. -`check_deliverability=True`: If true, DNS queries check that a non-null MX (or A/AAAA record as an MX fallback) is present for the domain-part of the email address and that a reject-all SPF record is not present. Set to `False` to skip these DNS checks. DNS is slow and sometimes unavailable, so consider whether these checks are useful for your use case. 
It is recommended to pass `False` when performing validation for login pages (but not account creation pages) since re-validation of the domain by querying DNS at every login is probably undesirable. You can also set `email_validator.CHECK_DELIVERABILITY` to `False` to turn this off for all calls by default. +`check_deliverability=True`: If true, a DNS query is made to check that a non-null MX record is present for the domain-part of the email address (or if not, an A/AAAA record as an MX fallback can be present but in that case a reject-all SPF record must not be present). Set to `False` to skip this DNS-based check. DNS is slow and sometimes unavailable, so consider whether these checks are useful for your use case. It is recommended to pass `False` when performing validation for login pages (but not account creation pages) since re-validation of a previously validated domain in your database by querying DNS at every login is probably undesirable. You can also set `email_validator.CHECK_DELIVERABILITY` to `False` to turn this off for all calls by default. `allow_empty_local=False`: Set to `True` to allow an empty local part (i.e. `@example.com`), e.g. for validating Postfix aliases. @@ -382,7 +381,7 @@ are: | `smtputf8` | A boolean indicating that the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) feature of your mail relay will be required to transmit messages to this address because the local part of the address has non-ASCII characters (the local part cannot be IDNA-encoded). If `allow_smtputf8=False` is passed as an argument, this flag will always be false because an exception is raised if it would have been true. | | `mx` | A list of (priority, domain) tuples of MX records specified in the DNS for the domain (see [RFC 5321 section 5](https://tools.ietf.org/html/rfc5321#section-5)). May be `None` if the deliverability check could not be completed because of a temporary issue like a timeout. | | `mx_fallback_type` | `None` if an `MX` record is found. 
If no MX records are actually specified in DNS and instead are inferred, through an obsolete mechanism, from A or AAAA records, the value is the type of DNS record used instead (`A` or `AAAA`). May be `None` if the deliverability check could not be completed because of a temporary issue like a timeout. | -| `spf` | Any SPF record found while checking deliverability. | +| `spf` | Any SPF record found while checking deliverability. Only set if the SPF record is queried. | Assumptions ----------- @@ -394,11 +393,10 @@ or likely to cause trouble: * The validator assumes the email address is intended to be usable on the public Internet. The domain part of the email address must be a resolvable domain name - (without NULL MX or SPF -all DNS records) if deliverability - checks are turned on. + (see the deliverability checks described above). Most [Special Use Domain Names](https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml) - and their subdomains and - domain names without a `.` are rejected as a syntax error + and their subdomains, as well as + domain names without a `.`, are rejected as a syntax error (except see the `test_environment` parameter above). * Obsolete email syntaxes are rejected: The "quoted string" form of the local part of the email address (RFC diff --git a/email_validator/__init__.py b/email_validator/__init__.py index b644a63..c86584a 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -648,7 +648,7 @@ def dns_resolver_resolve_shim(domain, record): except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN, dns.resolver.NoAnswer): - # If there was no MX record, fall back to an A record. + # If there was no MX record, fall back to an A record, as SMTP servers do. try: response = dns_resolver_resolve_shim(domain, "A") deliverability_info["mx"] = [(0, str(r)) for r in response] @@ -666,23 +666,25 @@ def dns_resolver_resolve_shim(domain, record): # this domain is not deliverable. 
raise EmailUndeliverableError("The domain name %s does not exist." % domain_i18n) - try: - # Check for a SPF reject all ("v=spf1 -all") record which indicates - # no emails are sent from this domain, which like a NULL MX record - # would indicate that the domain is not used for email. - response = dns_resolver_resolve_shim(domain, "TXT") - for rec in response: - value = b"".join(rec.strings) - if value.startswith(b"v=spf1 "): - deliverability_info["spf"] = value.decode("ascii", errors='replace') - if value == b"v=spf1 -all": - raise EmailUndeliverableError("The domain name %s does not send email." % domain_i18n) - except dns.resolver.NoAnswer: - # No TXT records means there is no SPF policy, so we cannot take any action. - pass - except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN): - # Failure to resolve at this step will be ignored. - pass + # Check for a SPF reject-all record ("v=spf1 -all") which indicates + # no emails are sent from this domain (similar to a NULL MX record + # but for sending rather than receiving). In combination with the + # absence of an MX record, this is probably a good sign that the + # domain is not used for email. + try: + response = dns_resolver_resolve_shim(domain, "TXT") + for rec in response: + value = b"".join(rec.strings) + if value.startswith(b"v=spf1 "): + deliverability_info["spf"] = value.decode("ascii", errors='replace') + if value == b"v=spf1 -all": + raise EmailUndeliverableError("The domain name %s does not send email." % domain_i18n) + except dns.resolver.NoAnswer: + # No TXT records means there is no SPF policy, so we cannot take any action. + pass + except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN): + # Failure to resolve at this step will be ignored. + pass except dns.exception.Timeout: # A timeout could occur for various reasons, so don't treat it as a failure. 
diff --git a/tests/test_main.py b/tests/test_main.py index 1ca6d76..adcbc1e 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -528,7 +528,7 @@ def test_dict_accessor(): def test_deliverability_found(): response = validate_email_deliverability('gmail.com', 'gmail.com') - assert response.keys() == {'mx', 'mx_fallback_type', 'spf'} + assert response.keys() == {'mx', 'mx_fallback_type'} assert response['mx_fallback_type'] is None assert len(response['mx']) > 1 assert len(response['mx'][0]) == 2 From c6722e12b7240013af0848a79e5ebe47aedf823a Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 21 Jan 2023 05:42:06 -0500 Subject: [PATCH 054/174] Version 1.3.1 --- CHANGELOG.md | 6 +++--- README.md | 5 +++-- setup.cfg | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2f1a0d0..91738d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ -In Development --------------- +Version 1.3.1 (January 21, 2023) +-------------------------------- -* The new SPF reject-all record check is now limited to domains that do not have MX records but do have an A/AAAA record fallback. +* The new SPF 'v=spf1 -all' (reject-all) deliverability check is removed in most cases. It now is performed only for domains that do not have MX records but do have an A/AAAA fallback record. Version 1.3.0 (September 18, 2022) ---------------------------------- diff --git a/README.md b/README.md index 84c4120..241a809 100644 --- a/README.md +++ b/README.md @@ -426,9 +426,10 @@ To release: * Update CHANGELOG.md. * Update the version number in setup.cfg. -* Make a commit with the new version number. -* Follow the steps below to publish source and a universal wheel to pypi and tag the release. +* Make & push a commit with the new version number. +* Make & push a tag (`git tag v... && git push --tags`). * Make a release at https://github.com/JoshData/python-email-validator/releases/new. 
+* Follow the steps below to publish source and a universal wheel to pypi. ```sh ./release_to_pypi.sh diff --git a/setup.cfg b/setup.cfg index c4d6f41..6a92d0a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = email_validator -version = 1.3.0 +version = 1.3.1 description = A robust email address syntax and deliverability validation library. long_description = file: README.md long_description_content_type = text/markdown From ab293fd0d4a66e9d89799d165e0c75a7c034121b Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Thu, 29 Sep 2022 18:15:50 -0400 Subject: [PATCH 055/174] Reorganize the library into smaller modules There are no logic changes in this commit, just moving code. --- CHANGELOG.md | 5 + email_validator/__init__.py | 673 +--------------------------- email_validator/__main__.py | 53 +++ email_validator/deliverability.py | 131 ++++++ email_validator/exceptions_types.py | 122 +++++ email_validator/rfc_constants.py | 39 ++ email_validator/syntax.py | 232 ++++++++++ email_validator/validate_email.py | 132 ++++++ tests/test_main.py | 17 +- 9 files changed, 730 insertions(+), 674 deletions(-) create mode 100644 email_validator/__main__.py create mode 100644 email_validator/deliverability.py create mode 100644 email_validator/exceptions_types.py create mode 100644 email_validator/rfc_constants.py create mode 100644 email_validator/syntax.py create mode 100644 email_validator/validate_email.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 91738d5..58c1128 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +In Development +-------------- + +* The library has been reorganized internally into smaller modules. 
+ Version 1.3.1 (January 21, 2023) -------------------------------- diff --git a/email_validator/__init__.py b/email_validator/__init__.py index c86584a..6a91fda 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -1,11 +1,12 @@ # -*- coding: utf-8 -*- -import sys -import re -import unicodedata -import dns.resolver -import dns.exception -import idna # implements IDNA 2008; Python's codec is only IDNA 2003 +# Export the main method, helper methods, and the public data types. +from .validate_email import validate_email # noqa: F401 +from .deliverability import caching_resolver # noqa: F401 +from .exceptions_types import * # noqa: F401,F403 + +# These global attributes are a part of the library's API and can be +# changed by library users. # Default values for keyword arguments. @@ -15,34 +16,6 @@ GLOBALLY_DELIVERABLE = True DEFAULT_TIMEOUT = 15 # secs -# Based on RFC 2822 section 3.2.4 / RFC 5322 section 3.2.3, these -# characters are permitted in email addresses (not taking into -# account internationalization): -ATEXT = r'a-zA-Z0-9_!#\$%&\'\*\+\-/=\?\^`\{\|\}~' - -# A "dot atom text", per RFC 2822 3.2.4: -DOT_ATOM_TEXT = '[' + ATEXT + ']+(?:\\.[' + ATEXT + ']+)*' - -# RFC 6531 section 3.3 extends the allowed characters in internationalized -# addresses to also include three specific ranges of UTF8 defined in -# RFC3629 section 4, which appear to be the Unicode code points from -# U+0080 to U+10FFFF. -ATEXT_INTL = ATEXT + u"\u0080-\U0010FFFF" -DOT_ATOM_TEXT_INTL = '[' + ATEXT_INTL + ']+(?:\\.[' + ATEXT_INTL + ']+)*' - -# The domain part of the email address, after IDNA (ASCII) encoding, -# must also satisfy the requirements of RFC 952/RFC 1123 which restrict -# the allowed characters of hostnames further. The hyphen cannot be at -# the beginning or end of a *dot-atom component* of a hostname either. 
-ATEXT_HOSTNAME = r'(?:(?:[a-zA-Z0-9][a-zA-Z0-9\-]*)?[a-zA-Z0-9])' - -# Length constants -# RFC 3696 + errata 1003 + errata 1690 (https://www.rfc-editor.org/errata_search.php?rfc=3696&eid=1690) -# explains the maximum length of an email address is 254 octets. -EMAIL_MAX_LENGTH = 254 -LOCAL_PART_MAX_LENGTH = 64 -DOMAIN_MAX_LENGTH = 255 - # IANA Special Use Domain Names # Last Updated 2021-09-21 # https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.txt @@ -107,635 +80,3 @@ # fail deliverability checks because "test" is not an actual TLD. "test", ] - -# ease compatibility in type checking -if sys.version_info >= (3,): - unicode_class = str -else: - unicode_class = unicode # noqa: F821 - - # turn regexes to unicode (because 'ur' literals are not allowed in Py3) - ATEXT = ATEXT.decode("ascii") - DOT_ATOM_TEXT = DOT_ATOM_TEXT.decode("ascii") - ATEXT_HOSTNAME = ATEXT_HOSTNAME.decode("ascii") - - -class EmailNotValidError(ValueError): - """Parent class of all exceptions raised by this module.""" - pass - - -class EmailSyntaxError(EmailNotValidError): - """Exception raised when an email address fails validation because of its form.""" - pass - - -class EmailUndeliverableError(EmailNotValidError): - """Exception raised when an email address fails validation because its domain name does not appear deliverable.""" - pass - - -class ValidatedEmail(object): - """The validate_email function returns objects of this type holding the normalized form of the email address - and other information.""" - - """The email address that was passed to validate_email. (If passed as bytes, this will be a string.)""" - original_email = None - - """The normalized email address, which should always be used in preferance to the original address. - The normalized address converts an IDNA ASCII domain name to Unicode, if possible, and performs - Unicode normalization on the local part and on the domain (if originally Unicode). 
It is the - concatenation of the local_part and domain attributes, separated by an @-sign.""" - email = None - - """The local part of the email address after Unicode normalization.""" - local_part = None - - """The domain part of the email address after Unicode normalization or conversion to - Unicode from IDNA ascii.""" - domain = None - - """If not None, a form of the email address that uses 7-bit ASCII characters only.""" - ascii_email = None - - """If not None, the local part of the email address using 7-bit ASCII characters only.""" - ascii_local_part = None - - """If not None, a form of the domain name that uses 7-bit ASCII characters only.""" - ascii_domain = None - - """If True, the SMTPUTF8 feature of your mail relay will be required to transmit messages - to this address. This flag is True just when ascii_local_part is missing. Otherwise it - is False.""" - smtputf8 = None - - """If a deliverability check is performed and if it succeeds, a list of (priority, domain) - tuples of MX records specified in the DNS for the domain.""" - mx = None - - """If no MX records are actually specified in DNS and instead are inferred, through an obsolete - mechanism, from A or AAAA records, the value is the type of DNS record used instead (`A` or `AAAA`).""" - mx_fallback_type = None - - """Tests use this constructor.""" - def __init__(self, **kwargs): - for k, v in kwargs.items(): - setattr(self, k, v) - - """As a convenience, str(...) on instances of this class return the normalized address.""" - def __self__(self): - return self.normalized_email - - def __repr__(self): - return "".format(self.email) - - """For backwards compatibility, some fields are also exposed through a dict-like interface. 
Note - that some of the names changed when they became attributes.""" - def __getitem__(self, key): - if key == "email": - return self.email - if key == "email_ascii": - return self.ascii_email - if key == "local": - return self.local_part - if key == "domain": - return self.ascii_domain - if key == "domain_i18n": - return self.domain - if key == "smtputf8": - return self.smtputf8 - if key == "mx": - return self.mx - if key == "mx-fallback": - return self.mx_fallback_type - raise KeyError() - - """Tests use this.""" - def __eq__(self, other): - if not isinstance(other, ValidatedEmail): - return False - return ( - self.email == other.email - and self.local_part == other.local_part - and self.domain == other.domain - and self.ascii_email == other.ascii_email - and self.ascii_local_part == other.ascii_local_part - and self.ascii_domain == other.ascii_domain - and self.smtputf8 == other.smtputf8 - and repr(sorted(self.mx) if self.mx else self.mx) - == repr(sorted(other.mx) if other.mx else other.mx) - and self.mx_fallback_type == other.mx_fallback_type - ) - - """This helps producing the README.""" - def as_constructor(self): - return "ValidatedEmail(" \ - + ",".join("\n {}={}".format( - key, - repr(getattr(self, key))) - for key in ('email', 'local_part', 'domain', - 'ascii_email', 'ascii_local_part', 'ascii_domain', - 'smtputf8', 'mx', 'mx_fallback_type') - ) \ - + ")" - - """Convenience method for accessing ValidatedEmail as a dict""" - def as_dict(self): - return self.__dict__ - - -def __get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH): - diff = len(addr) - limit - reason = "({}{} character{} too many)" - prefix = "at least " if utf8 else "" - suffix = "s" if diff > 1 else "" - return reason.format(prefix, diff, suffix) - - -def caching_resolver(*, timeout=None, cache=None): - if timeout is None: - timeout = DEFAULT_TIMEOUT - resolver = dns.resolver.Resolver() - resolver.cache = cache or dns.resolver.LRUCache() - resolver.lifetime = timeout # timeout, in 
seconds - return resolver - - -def validate_email( - email, - # /, # not supported in Python 3.6, 3.7 - *, - allow_smtputf8=None, - allow_empty_local=False, - check_deliverability=None, - test_environment=None, - globally_deliverable=GLOBALLY_DELIVERABLE, - timeout=None, - dns_resolver=None -): - """ - Validates an email address, raising an EmailNotValidError if the address is not valid or returning a dict of - information when the address is valid. The email argument can be a str or a bytes instance, - but if bytes it must be ASCII-only. - """ - - # Fill in default values of arguments. - if allow_smtputf8 is None: - allow_smtputf8 = ALLOW_SMTPUTF8 - if check_deliverability is None: - check_deliverability = CHECK_DELIVERABILITY - if test_environment is None: - test_environment = TEST_ENVIRONMENT - if timeout is None: - timeout = DEFAULT_TIMEOUT - - # Allow email to be a str or bytes instance. If bytes, - # it must be ASCII because that's how the bytes work - # on the wire with SMTP. - if not isinstance(email, (str, unicode_class)): - try: - email = email.decode("ascii") - except ValueError: - raise EmailSyntaxError("The email address is not valid ASCII.") - - # At-sign. - parts = email.split('@') - if len(parts) != 2: - raise EmailSyntaxError("The email address is not valid. It must have exactly one @-sign.") - - # Collect return values in this instance. - ret = ValidatedEmail() - ret.original_email = email - - # Validate the email address's local part syntax and get a normalized form. - local_part_info = validate_email_local_part(parts[0], - allow_smtputf8=allow_smtputf8, - allow_empty_local=allow_empty_local) - ret.local_part = local_part_info["local_part"] - ret.ascii_local_part = local_part_info["ascii_local_part"] - ret.smtputf8 = local_part_info["smtputf8"] - - # Validate the email address's domain part syntax and get a normalized form. 
- domain_part_info = validate_email_domain_part(parts[1], test_environment=test_environment, globally_deliverable=globally_deliverable) - ret.domain = domain_part_info["domain"] - ret.ascii_domain = domain_part_info["ascii_domain"] - - # Construct the complete normalized form. - ret.email = ret.local_part + "@" + ret.domain - - # If the email address has an ASCII form, add it. - if not ret.smtputf8: - ret.ascii_email = ret.ascii_local_part + "@" + ret.ascii_domain - - # If the email address has an ASCII representation, then we assume it may be - # transmitted in ASCII (we can't assume SMTPUTF8 will be used on all hops to - # the destination) and the length limit applies to ASCII characters (which is - # the same as octets). The number of characters in the internationalized form - # may be many fewer (because IDNA ASCII is verbose) and could be less than 254 - # Unicode characters, and of course the number of octets over the limit may - # not be the number of characters over the limit, so if the email address is - # internationalized, we can't give any simple information about why the address - # is too long. - # - # In addition, check that the UTF-8 encoding (i.e. not IDNA ASCII and not - # Unicode characters) is at most 254 octets. If the addres is transmitted using - # SMTPUTF8, then the length limit probably applies to the UTF-8 encoded octets. - # If the email address has an ASCII form that differs from its internationalized - # form, I don't think the internationalized form can be longer, and so the ASCII - # form length check would be sufficient. If there is no ASCII form, then we have - # to check the UTF-8 encoding. The UTF-8 encoding could be up to about four times - # longer than the number of characters. - # - # See the length checks on the local part and the domain. 
- if ret.ascii_email and len(ret.ascii_email) > EMAIL_MAX_LENGTH: - if ret.ascii_email == ret.email: - reason = __get_length_reason(ret.ascii_email) - elif len(ret.email) > EMAIL_MAX_LENGTH: - # If there are more than 254 characters, then the ASCII - # form is definitely going to be too long. - reason = __get_length_reason(ret.email, utf8=True) - else: - reason = "(when converted to IDNA ASCII)" - raise EmailSyntaxError("The email address is too long {}.".format(reason)) - if len(ret.email.encode("utf8")) > EMAIL_MAX_LENGTH: - if len(ret.email) > EMAIL_MAX_LENGTH: - # If there are more than 254 characters, then the UTF-8 - # encoding is definitely going to be too long. - reason = __get_length_reason(ret.email, utf8=True) - else: - reason = "(when encoded in bytes)" - raise EmailSyntaxError("The email address is too long {}.".format(reason)) - - if check_deliverability and not test_environment: - # Validate the email address's deliverability using DNS - # and update the return dict with metadata. - deliverability_info = validate_email_deliverability( - ret["domain"], ret["domain_i18n"], timeout, dns_resolver - ) - for key, value in deliverability_info.items(): - setattr(ret, key, value) - - return ret - - -def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=False): - # Validates the local part of an email address. - - if len(local) == 0: - if not allow_empty_local: - raise EmailSyntaxError("There must be something before the @-sign.") - else: - # The caller allows an empty local part. Useful for validating certain - # Postfix aliases. - return { - "local_part": local, - "ascii_local_part": local, - "smtputf8": False, - } - - # RFC 5321 4.5.3.1.1 - # We're checking the number of characters here. If the local part - # is ASCII-only, then that's the same as bytes (octets). If it's - # internationalized, then the UTF-8 encoding may be longer, but - # that may not be relevant. We will check the total address length - # instead. 
- if len(local) > LOCAL_PART_MAX_LENGTH: - reason = __get_length_reason(local, limit=LOCAL_PART_MAX_LENGTH) - raise EmailSyntaxError("The email address is too long before the @-sign {}.".format(reason)) - - # Check the local part against the regular expression for the older ASCII requirements. - m = re.match(DOT_ATOM_TEXT + "\\Z", local) - if m: - # Return the local part unchanged and flag that SMTPUTF8 is not needed. - return { - "local_part": local, - "ascii_local_part": local, - "smtputf8": False, - } - - else: - # The local part failed the ASCII check. Now try the extended internationalized requirements. - m = re.match(DOT_ATOM_TEXT_INTL + "\\Z", local) - if not m: - # It's not a valid internationalized address either. Report which characters were not valid. - bad_chars = ', '.join(sorted(set( - unicodedata.name(c, repr(c)) for c in local if not re.match(u"[" + (ATEXT if not allow_smtputf8 else ATEXT_INTL) + u"]", c) - ))) - raise EmailSyntaxError("The email address contains invalid characters before the @-sign: %s." % bad_chars) - - # It would be valid if internationalized characters were allowed by the caller. - if not allow_smtputf8: - raise EmailSyntaxError("Internationalized characters before the @-sign are not supported.") - - # It's valid. - - # RFC 6532 section 3.1 also says that Unicode NFC normalization should be applied, - # so we'll return the normalized local part in the return value. - local = unicodedata.normalize("NFC", local) - - # Check for unsafe characters. - # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked - # by DOT_ATOM_TEXT_INTL. 
- for i, c in enumerate(local): - category = unicodedata.category(c) - if category[0] in ("L", "N", "P", "S"): - # letters, numbers, punctuation, and symbols are permitted - pass - elif category[0] == "M": - # combining character in first position would combine with something - # outside of the email address if concatenated to the right, but are - # otherwise permitted - if i == 0: - raise EmailSyntaxError("The email address contains an initial invalid character (%s)." - % unicodedata.name(c, repr(c))) - elif category[0] in ("Z", "C"): - # spaces and line/paragraph characters (Z) and - # control, format, surrogate, private use, and unassigned code points (C) - raise EmailSyntaxError("The email address contains an invalid character (%s)." - % unicodedata.name(c, repr(c))) - else: - # All categories should be handled above, but in case there is something new - # in the future. - raise EmailSyntaxError("The email address contains a character (%s; category %s) that may not be safe." - % (unicodedata.name(c, repr(c)), category)) - - # Try encoding to UTF-8. Failure is possible with some characters like - # surrogate code points, but those are checked above. Still, we don't - # want to have an unhandled exception later. - try: - local.encode("utf8") - except ValueError: - raise EmailSyntaxError("The email address contains an invalid character.") - - # Flag that SMTPUTF8 will be required for deliverability. - return { - "local_part": local, - "ascii_local_part": None, # no ASCII form is possible - "smtputf8": True, - } - - -def validate_email_domain_part(domain, test_environment=False, globally_deliverable=True): - # Empty? - if len(domain) == 0: - raise EmailSyntaxError("There must be something after the @-sign.") - - # Perform UTS-46 normalization, which includes casefolding, NFC normalization, - # and converting all label separators (the period/full stop, fullwidth full stop, - # ideographic full stop, and halfwidth ideographic full stop) to basic periods. 
- # It will also raise an exception if there is an invalid character in the input, - # such as "⒈" which is invalid because it would expand to include a period. - try: - domain = idna.uts46_remap(domain, std3_rules=False, transitional=False) - except idna.IDNAError as e: - raise EmailSyntaxError("The domain name %s contains invalid characters (%s)." % (domain, str(e))) - - # Now we can perform basic checks on the use of periods (since equivalent - # symbols have been mapped to periods). These checks are needed because the - # IDNA library doesn't handle well domains that have empty labels (i.e. initial - # dot, trailing dot, or two dots in a row). - if domain.endswith("."): - raise EmailSyntaxError("An email address cannot end with a period.") - if domain.startswith("."): - raise EmailSyntaxError("An email address cannot have a period immediately after the @-sign.") - if ".." in domain: - raise EmailSyntaxError("An email address cannot have two periods in a row.") - - # Regardless of whether international characters are actually used, - # first convert to IDNA ASCII. For ASCII-only domains, the transformation - # does nothing. If internationalized characters are present, the MTA - # must either support SMTPUTF8 or the mail client must convert the - # domain name to IDNA before submission. - # - # Unfortunately this step incorrectly 'fixes' domain names with leading - # periods by removing them, so we have to check for this above. It also gives - # a funky error message ("No input") when there are two periods in a - # row, also checked separately above. - try: - ascii_domain = idna.encode(domain, uts46=False).decode("ascii") - except idna.IDNAError as e: - if "Domain too long" in str(e): - # We can't really be more specific because UTS-46 normalization means - # the length check is applied to a string that is different from the - # one the user supplied. 
Also I'm not sure if the length check applies - # to the internationalized form, the IDNA ASCII form, or even both! - raise EmailSyntaxError("The email address is too long after the @-sign.") - raise EmailSyntaxError("The domain name %s contains invalid characters (%s)." % (domain, str(e))) - - # We may have been given an IDNA ASCII domain to begin with. Check - # that the domain actually conforms to IDNA. It could look like IDNA - # but not be actual IDNA. For ASCII-only domains, the conversion out - # of IDNA just gives the same thing back. - # - # This gives us the canonical internationalized form of the domain, - # which we should use in all error messages. - try: - domain_i18n = idna.decode(ascii_domain.encode('ascii')) - except idna.IDNAError as e: - raise EmailSyntaxError("The domain name %s is not valid IDNA (%s)." % (ascii_domain, str(e))) - - # RFC 5321 4.5.3.1.2 - # We're checking the number of bytes (octets) here, which can be much - # higher than the number of characters in internationalized domains, - # on the assumption that the domain may be transmitted without SMTPUTF8 - # as IDNA ASCII. This is also checked by idna.encode, so this exception - # is never reached. - if len(ascii_domain) > DOMAIN_MAX_LENGTH: - raise EmailSyntaxError("The email address is too long after the @-sign.") - - # A "dot atom text", per RFC 2822 3.2.4, but using the restricted - # characters allowed in a hostname (see ATEXT_HOSTNAME above). - DOT_ATOM_TEXT = ATEXT_HOSTNAME + r'(?:\.' + ATEXT_HOSTNAME + r')*' - - # Check the regular expression. This is probably entirely redundant - # with idna.decode, which also checks this format. 
- m = re.match(DOT_ATOM_TEXT + "\\Z", ascii_domain) - if not m: - raise EmailSyntaxError("The email address contains invalid characters after the @-sign.") - - if globally_deliverable: - # All publicly deliverable addresses have domain named with at least - # one period, and we'll consider the lack of a period a syntax error - # since that will match people's sense of what an email address looks - # like. We'll skip this in test environments to allow '@test' email - # addresses. - if "." not in ascii_domain and not (ascii_domain == "test" and test_environment): - raise EmailSyntaxError("The domain name %s is not valid. It should have a period." % domain_i18n) - - # We also know that all TLDs currently end with a letter. - if not re.search(r"[A-Za-z]\Z", ascii_domain): - raise EmailSyntaxError( - "The domain name %s is not valid. It is not within a valid top-level domain." % domain_i18n - ) - - # Check special-use and reserved domain names. - # Some might fail DNS-based deliverability checks, but that - # can be turned off, so we should fail them all sooner. - for d in SPECIAL_USE_DOMAIN_NAMES: - # See the note near the definition of SPECIAL_USE_DOMAIN_NAMES. - if d == "test" and test_environment: - continue - - if ascii_domain == d or ascii_domain.endswith("." + d): - raise EmailSyntaxError("The domain name %s is a special-use or reserved name that cannot be used with email." % domain_i18n) - - # Return the IDNA ASCII-encoded form of the domain, which is how it - # would be transmitted on the wire (except when used with SMTPUTF8 - # possibly), as well as the canonical Unicode form of the domain, - # which is better for display purposes. This should also take care - # of RFC 6532 section 3.1's suggestion to apply Unicode NFC - # normalization to addresses. 
- return { - "ascii_domain": ascii_domain, - "domain": domain_i18n, - } - - -def validate_email_deliverability(domain, domain_i18n, timeout=DEFAULT_TIMEOUT, dns_resolver=None): - # Check that the domain resolves to an MX record. If there is no MX record, - # try an A or AAAA record which is a deprecated fallback for deliverability. - # (Note that changing the DEFAULT_TIMEOUT module-level attribute - # will not change the default value of this method's timeout argument.) - - # If no dns.resolver.Resolver was given, get dnspython's default resolver. - # Override the default resolver's timeout. This may affect other uses of - # dnspython in this process. - if dns_resolver is None: - dns_resolver = dns.resolver.get_default_resolver() - dns_resolver.lifetime = timeout - - deliverability_info = {} - - def dns_resolver_resolve_shim(domain, record): - try: - # dns.resolver.Resolver.resolve is new to dnspython 2.x. - # https://dnspython.readthedocs.io/en/latest/resolver-class.html#dns.resolver.Resolver.resolve - return dns_resolver.resolve(domain, record) - except AttributeError: - # dnspython 2.x is only available in Python 3.6 and later. For earlier versions - # of Python, we maintain compatibility with dnspython 1.x which has a - # dnspython.resolver.Resolver.query method instead. The only difference is that - # query may treat the domain as relative and use the system's search domains, - # which we prevent by adding a "." to the domain name to make it absolute. - # dns.resolver.Resolver.query is deprecated in dnspython version 2.x. - # https://dnspython.readthedocs.io/en/latest/resolver-class.html#dns.resolver.Resolver.query - return dns_resolver.query(domain + ".", record) - - try: - # We need a way to check how timeouts are handled in the tests. So we - # have a secret variable that if set makes this method always test the - # handling of a timeout. 
- if getattr(validate_email_deliverability, 'TEST_CHECK_TIMEOUT', False): - raise dns.exception.Timeout() - - try: - # Try resolving for MX records. - response = dns_resolver_resolve_shim(domain, "MX") - - # For reporting, put them in priority order and remove the trailing dot in the qnames. - mtas = sorted([(r.preference, str(r.exchange).rstrip('.')) for r in response]) - - # Remove "null MX" records from the list (their value is (0, ".") but we've stripped - # trailing dots, so the 'exchange' is just ""). If there was only a null MX record, - # email is not deliverable. - mtas = [(preference, exchange) for preference, exchange in mtas - if exchange != ""] - if len(mtas) == 0: - raise EmailUndeliverableError("The domain name %s does not accept email." % domain_i18n) - - deliverability_info["mx"] = mtas - deliverability_info["mx_fallback_type"] = None - - except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN, dns.resolver.NoAnswer): - - # If there was no MX record, fall back to an A record, as SMTP servers do. - try: - response = dns_resolver_resolve_shim(domain, "A") - deliverability_info["mx"] = [(0, str(r)) for r in response] - deliverability_info["mx_fallback_type"] = "A" - except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN, dns.resolver.NoAnswer): - - # If there was no A record, fall back to an AAAA record. - try: - response = dns_resolver_resolve_shim(domain, "AAAA") - deliverability_info["mx"] = [(0, str(r)) for r in response] - deliverability_info["mx_fallback_type"] = "AAAA" - except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN, dns.resolver.NoAnswer): - - # If there was no MX, A, or AAAA record, then mail to - # this domain is not deliverable. - raise EmailUndeliverableError("The domain name %s does not exist." % domain_i18n) - - # Check for a SPF reject-all record ("v=spf1 -all") which indicates - # no emails are sent from this domain (similar to a NULL MX record - # but for sending rather than receiving). 
In combination with the - # absence of an MX record, this is probably a good sign that the - # domain is not used for email. - try: - response = dns_resolver_resolve_shim(domain, "TXT") - for rec in response: - value = b"".join(rec.strings) - if value.startswith(b"v=spf1 "): - deliverability_info["spf"] = value.decode("ascii", errors='replace') - if value == b"v=spf1 -all": - raise EmailUndeliverableError("The domain name %s does not send email." % domain_i18n) - except dns.resolver.NoAnswer: - # No TXT records means there is no SPF policy, so we cannot take any action. - pass - except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN): - # Failure to resolve at this step will be ignored. - pass - - except dns.exception.Timeout: - # A timeout could occur for various reasons, so don't treat it as a failure. - return { - "unknown-deliverability": "timeout", - } - - except EmailUndeliverableError: - # Don't let these get clobbered by the wider except block below. - raise - - except Exception as e: - # Unhandled conditions should not propagate. - raise EmailUndeliverableError( - "There was an error while checking if the domain name in the email address is deliverable: " + str(e) - ) - - return deliverability_info - - -def main(): - import json - - def __utf8_input_shim(input_str): - if sys.version_info < (3,): - return input_str.decode("utf-8") - return input_str - - def __utf8_output_shim(output_str): - if sys.version_info < (3,): - return unicode_class(output_str).encode("utf-8") - return output_str - - if len(sys.argv) == 1: - # Validate the email addresses pased line-by-line on STDIN. - dns_resolver = caching_resolver() - for line in sys.stdin: - email = __utf8_input_shim(line.strip()) - try: - validate_email(email, dns_resolver=dns_resolver) - except EmailNotValidError as e: - print(__utf8_output_shim("{} {}".format(email, e))) - else: - # Validate the email address passed on the command line. 
- email = __utf8_input_shim(sys.argv[1]) - try: - result = validate_email(email) - print(json.dumps(result.as_dict(), indent=2, sort_keys=True, ensure_ascii=False)) - except EmailNotValidError as e: - print(__utf8_output_shim(e)) - - -if __name__ == "__main__": - main() diff --git a/email_validator/__main__.py b/email_validator/__main__.py new file mode 100644 index 0000000..e9a5ea7 --- /dev/null +++ b/email_validator/__main__.py @@ -0,0 +1,53 @@ +# A command-line tool for testing. +# +# Usage: +# +# python -m email_validator +# +# Provide email addresses to validate either as a command-line argument +# or in STDIN separated by newlines. No output will be given for valid +# email addresses. Validation errors will be printed for invalid email +# addresses. + +import json +import sys + +from .validate_email import validate_email +from .deliverability import caching_resolver +from .exceptions_types import EmailNotValidError + + +def __utf8_input_shim(input_str): + if sys.version_info < (3,): + return input_str.decode("utf-8") + return input_str + + +def __utf8_output_shim(output_str): + if sys.version_info < (3,): + return unicode_class(output_str).encode("utf-8") + return output_str + + +def main(): + if len(sys.argv) == 1: + # Validate the email addresses pased line-by-line on STDIN. + dns_resolver = caching_resolver() + for line in sys.stdin: + email = __utf8_input_shim(line.strip()) + try: + validate_email(email, dns_resolver=dns_resolver) + except EmailNotValidError as e: + print(__utf8_output_shim("{} {}".format(email, e))) + else: + # Validate the email address passed on the command line. 
+ email = __utf8_input_shim(sys.argv[1]) + try: + result = validate_email(email) + print(json.dumps(result.as_dict(), indent=2, sort_keys=True, ensure_ascii=False)) + except EmailNotValidError as e: + print(__utf8_output_shim(e)) + + +if __name__ == "__main__": + main() diff --git a/email_validator/deliverability.py b/email_validator/deliverability.py new file mode 100644 index 0000000..2b4ec96 --- /dev/null +++ b/email_validator/deliverability.py @@ -0,0 +1,131 @@ +from .exceptions_types import EmailUndeliverableError + +import dns.resolver +import dns.exception + + +def caching_resolver(*, timeout=None, cache=None): + if timeout is None: + from . import DEFAULT_TIMEOUT + timeout = DEFAULT_TIMEOUT + resolver = dns.resolver.Resolver() + resolver.cache = cache or dns.resolver.LRUCache() + resolver.lifetime = timeout # timeout, in seconds + return resolver + + +def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolver=None): + # Check that the domain resolves to an MX record. If there is no MX record, + # try an A or AAAA record which is a deprecated fallback for deliverability. + # Raises an EmailUndeliverableError on failure. On success, returns a dict + # with deliverability information. + + # If no dns.resolver.Resolver was given, get dnspython's default resolver. + # Override the default resolver's timeout. This may affect other uses of + # dnspython in this process. + if dns_resolver is None: + from . import DEFAULT_TIMEOUT + if timeout is None: + timeout = DEFAULT_TIMEOUT + dns_resolver = dns.resolver.get_default_resolver() + dns_resolver.lifetime = timeout + + deliverability_info = {} + + def dns_resolver_resolve_shim(domain, record): + try: + # dns.resolver.Resolver.resolve is new to dnspython 2.x. + # https://dnspython.readthedocs.io/en/latest/resolver-class.html#dns.resolver.Resolver.resolve + return dns_resolver.resolve(domain, record) + except AttributeError: + # dnspython 2.x is only available in Python 3.6 and later. 
For earlier versions + # of Python, we maintain compatibility with dnspython 1.x which has a + # dnspython.resolver.Resolver.query method instead. The only difference is that + # query may treat the domain as relative and use the system's search domains, + # which we prevent by adding a "." to the domain name to make it absolute. + # dns.resolver.Resolver.query is deprecated in dnspython version 2.x. + # https://dnspython.readthedocs.io/en/latest/resolver-class.html#dns.resolver.Resolver.query + return dns_resolver.query(domain + ".", record) + + try: + # We need a way to check how timeouts are handled in the tests. So we + # have a secret variable that if set makes this method always test the + # handling of a timeout. + if getattr(validate_email_deliverability, 'TEST_CHECK_TIMEOUT', False): + raise dns.exception.Timeout() + + try: + # Try resolving for MX records. + response = dns_resolver_resolve_shim(domain, "MX") + + # For reporting, put them in priority order and remove the trailing dot in the qnames. + mtas = sorted([(r.preference, str(r.exchange).rstrip('.')) for r in response]) + + # Remove "null MX" records from the list (their value is (0, ".") but we've stripped + # trailing dots, so the 'exchange' is just ""). If there was only a null MX record, + # email is not deliverable. + mtas = [(preference, exchange) for preference, exchange in mtas + if exchange != ""] + if len(mtas) == 0: + raise EmailUndeliverableError("The domain name %s does not accept email." % domain_i18n) + + deliverability_info["mx"] = mtas + deliverability_info["mx_fallback_type"] = None + + except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN, dns.resolver.NoAnswer): + + # If there was no MX record, fall back to an A record, as SMTP servers do. 
+ try: + response = dns_resolver_resolve_shim(domain, "A") + deliverability_info["mx"] = [(0, str(r)) for r in response] + deliverability_info["mx_fallback_type"] = "A" + except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN, dns.resolver.NoAnswer): + + # If there was no A record, fall back to an AAAA record. + try: + response = dns_resolver_resolve_shim(domain, "AAAA") + deliverability_info["mx"] = [(0, str(r)) for r in response] + deliverability_info["mx_fallback_type"] = "AAAA" + except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN, dns.resolver.NoAnswer): + + # If there was no MX, A, or AAAA record, then mail to + # this domain is not deliverable. + raise EmailUndeliverableError("The domain name %s does not exist." % domain_i18n) + + # Check for a SPF reject-all record ("v=spf1 -all") which indicates + # no emails are sent from this domain (similar to a NULL MX record + # but for sending rather than receiving). In combination with the + # absence of an MX record, this is probably a good sign that the + # domain is not used for email. + try: + response = dns_resolver_resolve_shim(domain, "TXT") + for rec in response: + value = b"".join(rec.strings) + if value.startswith(b"v=spf1 "): + deliverability_info["spf"] = value.decode("ascii", errors='replace') + if value == b"v=spf1 -all": + raise EmailUndeliverableError("The domain name %s does not send email." % domain_i18n) + except dns.resolver.NoAnswer: + # No TXT records means there is no SPF policy, so we cannot take any action. + pass + except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN): + # Failure to resolve at this step will be ignored. + pass + + except dns.exception.Timeout: + # A timeout could occur for various reasons, so don't treat it as a failure. + return { + "unknown-deliverability": "timeout", + } + + except EmailUndeliverableError: + # Don't let these get clobbered by the wider except block below. + raise + + except Exception as e: + # Unhandled conditions should not propagate. 
+ raise EmailUndeliverableError( + "There was an error while checking if the domain name in the email address is deliverable: " + str(e) + ) + + return deliverability_info diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py new file mode 100644 index 0000000..4fb913d --- /dev/null +++ b/email_validator/exceptions_types.py @@ -0,0 +1,122 @@ +class EmailNotValidError(ValueError): + """Parent class of all exceptions raised by this module.""" + pass + + +class EmailSyntaxError(EmailNotValidError): + """Exception raised when an email address fails validation because of its form.""" + pass + + +class EmailUndeliverableError(EmailNotValidError): + """Exception raised when an email address fails validation because its domain name does not appear deliverable.""" + pass + + +class ValidatedEmail(object): + """The validate_email function returns objects of this type holding the normalized form of the email address + and other information.""" + + """The email address that was passed to validate_email. (If passed as bytes, this will be a string.)""" + original_email = None + + """The normalized email address, which should always be used in preferance to the original address. + The normalized address converts an IDNA ASCII domain name to Unicode, if possible, and performs + Unicode normalization on the local part and on the domain (if originally Unicode). 
It is the + concatenation of the local_part and domain attributes, separated by an @-sign.""" + email = None + + """The local part of the email address after Unicode normalization.""" + local_part = None + + """The domain part of the email address after Unicode normalization or conversion to + Unicode from IDNA ascii.""" + domain = None + + """If not None, a form of the email address that uses 7-bit ASCII characters only.""" + ascii_email = None + + """If not None, the local part of the email address using 7-bit ASCII characters only.""" + ascii_local_part = None + + """If not None, a form of the domain name that uses 7-bit ASCII characters only.""" + ascii_domain = None + + """If True, the SMTPUTF8 feature of your mail relay will be required to transmit messages + to this address. This flag is True just when ascii_local_part is missing. Otherwise it + is False.""" + smtputf8 = None + + """If a deliverability check is performed and if it succeeds, a list of (priority, domain) + tuples of MX records specified in the DNS for the domain.""" + mx = None + + """If no MX records are actually specified in DNS and instead are inferred, through an obsolete + mechanism, from A or AAAA records, the value is the type of DNS record used instead (`A` or `AAAA`).""" + mx_fallback_type = None + + """Tests use this constructor.""" + def __init__(self, **kwargs): + for k, v in kwargs.items(): + setattr(self, k, v) + + """As a convenience, str(...) on instances of this class return the normalized address.""" + def __self__(self): + return self.normalized_email + + def __repr__(self): + return "".format(self.email) + + """For backwards compatibility, some fields are also exposed through a dict-like interface. 
Note + that some of the names changed when they became attributes.""" + def __getitem__(self, key): + if key == "email": + return self.email + if key == "email_ascii": + return self.ascii_email + if key == "local": + return self.local_part + if key == "domain": + return self.ascii_domain + if key == "domain_i18n": + return self.domain + if key == "smtputf8": + return self.smtputf8 + if key == "mx": + return self.mx + if key == "mx-fallback": + return self.mx_fallback_type + raise KeyError() + + """Tests use this.""" + def __eq__(self, other): + if not isinstance(other, ValidatedEmail): + return False + return ( + self.email == other.email + and self.local_part == other.local_part + and self.domain == other.domain + and self.ascii_email == other.ascii_email + and self.ascii_local_part == other.ascii_local_part + and self.ascii_domain == other.ascii_domain + and self.smtputf8 == other.smtputf8 + and repr(sorted(self.mx) if self.mx else self.mx) + == repr(sorted(other.mx) if other.mx else other.mx) + and self.mx_fallback_type == other.mx_fallback_type + ) + + """This helps producing the README.""" + def as_constructor(self): + return "ValidatedEmail(" \ + + ",".join("\n {}={}".format( + key, + repr(getattr(self, key))) + for key in ('email', 'local_part', 'domain', + 'ascii_email', 'ascii_local_part', 'ascii_domain', + 'smtputf8', 'mx', 'mx_fallback_type') + ) \ + + ")" + + """Convenience method for accessing ValidatedEmail as a dict""" + def as_dict(self): + return self.__dict__ diff --git a/email_validator/rfc_constants.py b/email_validator/rfc_constants.py new file mode 100644 index 0000000..0ad9467 --- /dev/null +++ b/email_validator/rfc_constants.py @@ -0,0 +1,39 @@ +import sys + +# These constants are defined by the email specifications. 
+ +# Based on RFC 2822 section 3.2.4 / RFC 5322 section 3.2.3, these +# characters are permitted in email addresses (not taking into +# account internationalization): +ATEXT = r'a-zA-Z0-9_!#\$%&\'\*\+\-/=\?\^`\{\|\}~' + +# A "dot atom text", per RFC 2822 3.2.4: +DOT_ATOM_TEXT = '[' + ATEXT + ']+(?:\\.[' + ATEXT + ']+)*' + +# RFC 6531 section 3.3 extends the allowed characters in internationalized +# addresses to also include three specific ranges of UTF8 defined in +# RFC3629 section 4, which appear to be the Unicode code points from +# U+0080 to U+10FFFF. +ATEXT_INTL = ATEXT + u"\u0080-\U0010FFFF" +DOT_ATOM_TEXT_INTL = '[' + ATEXT_INTL + ']+(?:\\.[' + ATEXT_INTL + ']+)*' + +# The domain part of the email address, after IDNA (ASCII) encoding, +# must also satisfy the requirements of RFC 952/RFC 1123 which restrict +# the allowed characters of hostnames further. The hyphen cannot be at +# the beginning or end of a *dot-atom component* of a hostname either. +ATEXT_HOSTNAME = r'(?:(?:[a-zA-Z0-9][a-zA-Z0-9\-]*)?[a-zA-Z0-9])' + +# Length constants +# RFC 3696 + errata 1003 + errata 1690 (https://www.rfc-editor.org/errata_search.php?rfc=3696&eid=1690) +# explains the maximum length of an email address is 254 octets. +EMAIL_MAX_LENGTH = 254 +LOCAL_PART_MAX_LENGTH = 64 +DOMAIN_MAX_LENGTH = 255 + + +# In Python 2.x, turn the regexes above from bytes regexes into unicode +# regexes. If Python 3.x had a "ur" string literal prefix we'd use that instead. 
+if sys.version_info < (3,): + ATEXT = ATEXT.decode("ascii") + DOT_ATOM_TEXT = DOT_ATOM_TEXT.decode("ascii") + ATEXT_HOSTNAME = ATEXT_HOSTNAME.decode("ascii") diff --git a/email_validator/syntax.py b/email_validator/syntax.py new file mode 100644 index 0000000..a3a44eb --- /dev/null +++ b/email_validator/syntax.py @@ -0,0 +1,232 @@ +from .exceptions_types import EmailSyntaxError +from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \ + DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT, ATEXT_HOSTNAME, ATEXT_INTL + +import re +import unicodedata +import idna # implements IDNA 2008; Python's codec is only IDNA 2003 + + +def get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH): + """Helper function to return an error message related to invalid length.""" + diff = len(addr) - limit + reason = "({}{} character{} too many)" + prefix = "at least " if utf8 else "" + suffix = "s" if diff > 1 else "" + return reason.format(prefix, diff, suffix) + + +def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=False): + """Validates the syntax of the local part of an email address.""" + + if len(local) == 0: + if not allow_empty_local: + raise EmailSyntaxError("There must be something before the @-sign.") + else: + # The caller allows an empty local part. Useful for validating certain + # Postfix aliases. + return { + "local_part": local, + "ascii_local_part": local, + "smtputf8": False, + } + + # RFC 5321 4.5.3.1.1 + # We're checking the number of characters here. If the local part + # is ASCII-only, then that's the same as bytes (octets). If it's + # internationalized, then the UTF-8 encoding may be longer, but + # that may not be relevant. We will check the total address length + # instead. 
+ if len(local) > LOCAL_PART_MAX_LENGTH: + reason = get_length_reason(local, limit=LOCAL_PART_MAX_LENGTH) + raise EmailSyntaxError("The email address is too long before the @-sign {}.".format(reason)) + + # Check the local part against the regular expression for the older ASCII requirements. + m = re.match(DOT_ATOM_TEXT + "\\Z", local) + if m: + # Return the local part unchanged and flag that SMTPUTF8 is not needed. + return { + "local_part": local, + "ascii_local_part": local, + "smtputf8": False, + } + + else: + # The local part failed the ASCII check. Now try the extended internationalized requirements. + m = re.match(DOT_ATOM_TEXT_INTL + "\\Z", local) + if not m: + # It's not a valid internationalized address either. Report which characters were not valid. + bad_chars = ', '.join(sorted(set( + unicodedata.name(c, repr(c)) for c in local if not re.match(u"[" + (ATEXT if not allow_smtputf8 else ATEXT_INTL) + u"]", c) + ))) + raise EmailSyntaxError("The email address contains invalid characters before the @-sign: %s." % bad_chars) + + # It would be valid if internationalized characters were allowed by the caller. + if not allow_smtputf8: + raise EmailSyntaxError("Internationalized characters before the @-sign are not supported.") + + # It's valid. + + # RFC 6532 section 3.1 also says that Unicode NFC normalization should be applied, + # so we'll return the normalized local part in the return value. + local = unicodedata.normalize("NFC", local) + + # Check for unsafe characters. + # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked + # by DOT_ATOM_TEXT_INTL. 
+ for i, c in enumerate(local): + category = unicodedata.category(c) + if category[0] in ("L", "N", "P", "S"): + # letters, numbers, punctuation, and symbols are permitted + pass + elif category[0] == "M": + # combining character in first position would combine with something + # outside of the email address if concatenated to the right, but are + # otherwise permitted + if i == 0: + raise EmailSyntaxError("The email address contains an initial invalid character (%s)." + % unicodedata.name(c, repr(c))) + elif category[0] in ("Z", "C"): + # spaces and line/paragraph characters (Z) and + # control, format, surrogate, private use, and unassigned code points (C) + raise EmailSyntaxError("The email address contains an invalid character (%s)." + % unicodedata.name(c, repr(c))) + else: + # All categories should be handled above, but in case there is something new + # in the future. + raise EmailSyntaxError("The email address contains a character (%s; category %s) that may not be safe." + % (unicodedata.name(c, repr(c)), category)) + + # Try encoding to UTF-8. Failure is possible with some characters like + # surrogate code points, but those are checked above. Still, we don't + # want to have an unhandled exception later. + try: + local.encode("utf8") + except ValueError: + raise EmailSyntaxError("The email address contains an invalid character.") + + # Flag that SMTPUTF8 will be required for deliverability. + return { + "local_part": local, + "ascii_local_part": None, # no ASCII form is possible + "smtputf8": True, + } + + +def validate_email_domain_part(domain, test_environment=False, globally_deliverable=True): + """Validates the syntax of the domain part of an email address.""" + + # Empty? 
+ if len(domain) == 0: + raise EmailSyntaxError("There must be something after the @-sign.") + + # Perform UTS-46 normalization, which includes casefolding, NFC normalization, + # and converting all label separators (the period/full stop, fullwidth full stop, + # ideographic full stop, and halfwidth ideographic full stop) to basic periods. + # It will also raise an exception if there is an invalid character in the input, + # such as "⒈" which is invalid because it would expand to include a period. + try: + domain = idna.uts46_remap(domain, std3_rules=False, transitional=False) + except idna.IDNAError as e: + raise EmailSyntaxError("The domain name %s contains invalid characters (%s)." % (domain, str(e))) + + # Now we can perform basic checks on the use of periods (since equivalent + # symbols have been mapped to periods). These checks are needed because the + # IDNA library doesn't handle well domains that have empty labels (i.e. initial + # dot, trailing dot, or two dots in a row). + if domain.endswith("."): + raise EmailSyntaxError("An email address cannot end with a period.") + if domain.startswith("."): + raise EmailSyntaxError("An email address cannot have a period immediately after the @-sign.") + if ".." in domain: + raise EmailSyntaxError("An email address cannot have two periods in a row.") + + # Regardless of whether international characters are actually used, + # first convert to IDNA ASCII. For ASCII-only domains, the transformation + # does nothing. If internationalized characters are present, the MTA + # must either support SMTPUTF8 or the mail client must convert the + # domain name to IDNA before submission. + # + # Unfortunately this step incorrectly 'fixes' domain names with leading + # periods by removing them, so we have to check for this above. It also gives + # a funky error message ("No input") when there are two periods in a + # row, also checked separately above. 
+ try: + ascii_domain = idna.encode(domain, uts46=False).decode("ascii") + except idna.IDNAError as e: + if "Domain too long" in str(e): + # We can't really be more specific because UTS-46 normalization means + # the length check is applied to a string that is different from the + # one the user supplied. Also I'm not sure if the length check applies + # to the internationalized form, the IDNA ASCII form, or even both! + raise EmailSyntaxError("The email address is too long after the @-sign.") + raise EmailSyntaxError("The domain name %s contains invalid characters (%s)." % (domain, str(e))) + + # We may have been given an IDNA ASCII domain to begin with. Check + # that the domain actually conforms to IDNA. It could look like IDNA + # but not be actual IDNA. For ASCII-only domains, the conversion out + # of IDNA just gives the same thing back. + # + # This gives us the canonical internationalized form of the domain, + # which we should use in all error messages. + try: + domain_i18n = idna.decode(ascii_domain.encode('ascii')) + except idna.IDNAError as e: + raise EmailSyntaxError("The domain name %s is not valid IDNA (%s)." % (ascii_domain, str(e))) + + # RFC 5321 4.5.3.1.2 + # We're checking the number of bytes (octets) here, which can be much + # higher than the number of characters in internationalized domains, + # on the assumption that the domain may be transmitted without SMTPUTF8 + # as IDNA ASCII. This is also checked by idna.encode, so this exception + # is never reached. + if len(ascii_domain) > DOMAIN_MAX_LENGTH: + raise EmailSyntaxError("The email address is too long after the @-sign.") + + # A "dot atom text", per RFC 2822 3.2.4, but using the restricted + # characters allowed in a hostname (see ATEXT_HOSTNAME above). + DOT_ATOM_TEXT = ATEXT_HOSTNAME + r'(?:\.' + ATEXT_HOSTNAME + r')*' + + # Check the regular expression. This is probably entirely redundant + # with idna.decode, which also checks this format. 
+ m = re.match(DOT_ATOM_TEXT + "\\Z", ascii_domain) + if not m: + raise EmailSyntaxError("The email address contains invalid characters after the @-sign.") + + if globally_deliverable: + # All publicly deliverable addresses have domain named with at least + # one period, and we'll consider the lack of a period a syntax error + # since that will match people's sense of what an email address looks + # like. We'll skip this in test environments to allow '@test' email + # addresses. + if "." not in ascii_domain and not (ascii_domain == "test" and test_environment): + raise EmailSyntaxError("The domain name %s is not valid. It should have a period." % domain_i18n) + + # We also know that all TLDs currently end with a letter. + if not re.search(r"[A-Za-z]\Z", ascii_domain): + raise EmailSyntaxError( + "The domain name %s is not valid. It is not within a valid top-level domain." % domain_i18n + ) + + # Check special-use and reserved domain names. + # Some might fail DNS-based deliverability checks, but that + # can be turned off, so we should fail them all sooner. + from . import SPECIAL_USE_DOMAIN_NAMES + for d in SPECIAL_USE_DOMAIN_NAMES: + # See the note near the definition of SPECIAL_USE_DOMAIN_NAMES. + if d == "test" and test_environment: + continue + + if ascii_domain == d or ascii_domain.endswith("." + d): + raise EmailSyntaxError("The domain name %s is a special-use or reserved name that cannot be used with email." % domain_i18n) + + # Return the IDNA ASCII-encoded form of the domain, which is how it + # would be transmitted on the wire (except when used with SMTPUTF8 + # possibly), as well as the canonical Unicode form of the domain, + # which is better for display purposes. This should also take care + # of RFC 6532 section 3.1's suggestion to apply Unicode NFC + # normalization to addresses. 
+ return { + "ascii_domain": ascii_domain, + "domain": domain_i18n, + } diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py new file mode 100644 index 0000000..4a888b4 --- /dev/null +++ b/email_validator/validate_email.py @@ -0,0 +1,132 @@ +import sys + +from .exceptions_types import EmailSyntaxError, ValidatedEmail +from .syntax import validate_email_local_part, validate_email_domain_part, get_length_reason +from .deliverability import validate_email_deliverability +from .rfc_constants import EMAIL_MAX_LENGTH + +# ease compatibility in type checking +if sys.version_info >= (3,): + unicode_class = str +else: + unicode_class = unicode # noqa: F821 + + +def validate_email( + email, + # /, # not supported in Python 3.6, 3.7 + *, + allow_smtputf8=None, + allow_empty_local=False, + check_deliverability=None, + test_environment=None, + globally_deliverable=None, + timeout=None, + dns_resolver=None +): + """ + Validates an email address, raising an EmailNotValidError if the address is not valid or returning a dict of + information when the address is valid. The email argument can be a str or a bytes instance, + but if bytes it must be ASCII-only. This is the main method of this library. + """ + + # Fill in default values of arguments. + from . import ALLOW_SMTPUTF8, CHECK_DELIVERABILITY, TEST_ENVIRONMENT, GLOBALLY_DELIVERABLE, DEFAULT_TIMEOUT + if allow_smtputf8 is None: + allow_smtputf8 = ALLOW_SMTPUTF8 + if check_deliverability is None: + check_deliverability = CHECK_DELIVERABILITY + if test_environment is None: + test_environment = TEST_ENVIRONMENT + if globally_deliverable is None: + globally_deliverable = GLOBALLY_DELIVERABLE + if timeout is None: + timeout = DEFAULT_TIMEOUT + + # Allow email to be a str or bytes instance. If bytes, + # it must be ASCII because that's how the bytes work + # on the wire with SMTP. 
+ if not isinstance(email, (str, unicode_class)): + try: + email = email.decode("ascii") + except ValueError: + raise EmailSyntaxError("The email address is not valid ASCII.") + + # At-sign. + parts = email.split('@') + if len(parts) != 2: + raise EmailSyntaxError("The email address is not valid. It must have exactly one @-sign.") + + # Collect return values in this instance. + ret = ValidatedEmail() + ret.original_email = email + + # Validate the email address's local part syntax and get a normalized form. + local_part_info = validate_email_local_part(parts[0], + allow_smtputf8=allow_smtputf8, + allow_empty_local=allow_empty_local) + ret.local_part = local_part_info["local_part"] + ret.ascii_local_part = local_part_info["ascii_local_part"] + ret.smtputf8 = local_part_info["smtputf8"] + + # Validate the email address's domain part syntax and get a normalized form. + domain_part_info = validate_email_domain_part(parts[1], test_environment=test_environment, globally_deliverable=globally_deliverable) + ret.domain = domain_part_info["domain"] + ret.ascii_domain = domain_part_info["ascii_domain"] + + # Construct the complete normalized form. + ret.email = ret.local_part + "@" + ret.domain + + # If the email address has an ASCII form, add it. + if not ret.smtputf8: + ret.ascii_email = ret.ascii_local_part + "@" + ret.ascii_domain + + # If the email address has an ASCII representation, then we assume it may be + # transmitted in ASCII (we can't assume SMTPUTF8 will be used on all hops to + # the destination) and the length limit applies to ASCII characters (which is + # the same as octets). 
The number of characters in the internationalized form + # may be many fewer (because IDNA ASCII is verbose) and could be less than 254 + # Unicode characters, and of course the number of octets over the limit may + # not be the number of characters over the limit, so if the email address is + # internationalized, we can't give any simple information about why the address + # is too long. + # + # In addition, check that the UTF-8 encoding (i.e. not IDNA ASCII and not + # Unicode characters) is at most 254 octets. If the addres is transmitted using + # SMTPUTF8, then the length limit probably applies to the UTF-8 encoded octets. + # If the email address has an ASCII form that differs from its internationalized + # form, I don't think the internationalized form can be longer, and so the ASCII + # form length check would be sufficient. If there is no ASCII form, then we have + # to check the UTF-8 encoding. The UTF-8 encoding could be up to about four times + # longer than the number of characters. + # + # See the length checks on the local part and the domain. + if ret.ascii_email and len(ret.ascii_email) > EMAIL_MAX_LENGTH: + if ret.ascii_email == ret.email: + reason = get_length_reason(ret.ascii_email) + elif len(ret.email) > EMAIL_MAX_LENGTH: + # If there are more than 254 characters, then the ASCII + # form is definitely going to be too long. + reason = get_length_reason(ret.email, utf8=True) + else: + reason = "(when converted to IDNA ASCII)" + raise EmailSyntaxError("The email address is too long {}.".format(reason)) + if len(ret.email.encode("utf8")) > EMAIL_MAX_LENGTH: + if len(ret.email) > EMAIL_MAX_LENGTH: + # If there are more than 254 characters, then the UTF-8 + # encoding is definitely going to be too long. 
+ reason = get_length_reason(ret.email, utf8=True) + else: + reason = "(when encoded in bytes)" + raise EmailSyntaxError("The email address is too long {}.".format(reason)) + + if check_deliverability and not test_environment: + # Validate the email address's deliverability using DNS + # and update the return dict with metadata. + deliverability_info = validate_email_deliverability( + ret["domain"], ret["domain_i18n"], timeout, dns_resolver + ) + for key, value in deliverability_info.items(): + setattr(ret, key, value) + + return ret diff --git a/tests/test_main.py b/tests/test_main.py index adcbc1e..89ef596 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -2,10 +2,11 @@ import re import pytest from email_validator import EmailSyntaxError, EmailUndeliverableError, \ - validate_email, validate_email_deliverability, \ - caching_resolver, ValidatedEmail + validate_email, \ + ValidatedEmail +from email_validator.deliverability import caching_resolver, validate_email_deliverability # Let's test main but rename it to be clear -from email_validator import main as validator_main +from email_validator.__main__ import main as validator_command_line_tool @pytest.mark.parametrize( @@ -561,7 +562,7 @@ def test_main_single_good_input(monkeypatch, capsys): import json test_email = "google@google.com" monkeypatch.setattr('sys.argv', ['email_validator', test_email]) - validator_main() + validator_command_line_tool() stdout, _ = capsys.readouterr() output = json.loads(str(stdout)) assert isinstance(output, dict) @@ -571,7 +572,7 @@ def test_main_single_good_input(monkeypatch, capsys): def test_main_single_bad_input(monkeypatch, capsys): bad_email = 'test@..com' monkeypatch.setattr('sys.argv', ['email_validator', bad_email]) - validator_main() + validator_command_line_tool() stdout, _ = capsys.readouterr() assert stdout == 'An email address cannot have a period immediately after the @-sign.\n' @@ -582,7 +583,7 @@ def test_main_multi_input(monkeypatch, capsys): test_input = 
io.StringIO("\n".join(test_cases)) monkeypatch.setattr('sys.stdin', test_input) monkeypatch.setattr('sys.argv', ['email_validator']) - validator_main() + validator_command_line_tool() stdout, _ = capsys.readouterr() assert test_cases[0] not in stdout assert test_cases[1] not in stdout @@ -595,7 +596,7 @@ def test_main_input_shim(monkeypatch, capsys): monkeypatch.setattr('sys.version_info', (2, 7)) test_email = b"google@google.com" monkeypatch.setattr('sys.argv', ['email_validator', test_email]) - validator_main() + validator_command_line_tool() stdout, _ = capsys.readouterr() output = json.loads(str(stdout)) assert isinstance(output, dict) @@ -606,7 +607,7 @@ def test_main_output_shim(monkeypatch, capsys): monkeypatch.setattr('sys.version_info', (2, 7)) test_email = b"test@.com" monkeypatch.setattr('sys.argv', ['email_validator', test_email]) - validator_main() + validator_command_line_tool() stdout, _ = capsys.readouterr() # This looks bad but it has to do with the way python 2.7 prints vs py3 From 34b5856e94a7d099c0e836c391a0cccbbfe5e0a0 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 16 Jan 2023 06:37:35 -0500 Subject: [PATCH 056/174] Remove Python 2.x shims and tests since Py2 is not supported anymore See d7fd074eeee47663042730943861980495fc8225. --- README.md | 21 +++++++++++---------- email_validator/__main__.py | 20 ++++---------------- email_validator/rfc_constants.py | 10 ---------- email_validator/validate_email.py | 10 +--------- tests/test_main.py | 25 ------------------------- 5 files changed, 16 insertions(+), 70 deletions(-) diff --git a/README.md b/README.md index 241a809..21c61e2 100644 --- a/README.md +++ b/README.md @@ -251,16 +251,6 @@ part is converted to [IDNA ASCII](https://tools.ietf.org/html/rfc5891). (You probably should not do this at account creation time so you don't change the user's login information without telling them.) -### Support for Python 2.7 - -The last version of this library supporting Python 2.x is version 1.2.1. 
- -When using Python 2.x, it is required that it was built with -UCS-4 support (see -[here](https://stackoverflow.com/questions/29109944/python-returns-length-of-2-for-single-unicode-character-string)). -Without UCS-4 support, unicode characters outside of the BMP (Basic -Multilingual Plane) will not validate correctly. - Normalization ------------- @@ -407,6 +397,17 @@ or likely to cause trouble: IP address in brackets) is rejected. Other obsolete and deprecated syntaxes are rejected. No one uses these forms anymore. +Support for Python 2.x +---------------------- + +The last version of this library supporting Python 2.x is version 1.2.1. + +When using Python 2.x, it is required that Python be built with +UCS-4 support (see +[here](https://stackoverflow.com/questions/29109944/python-returns-length-of-2-for-single-unicode-character-string)). +Without UCS-4 support, unicode characters outside of the BMP (Basic +Multilingual Plane) will not validate correctly in internationalized addresses. + Testing ------- diff --git a/email_validator/__main__.py b/email_validator/__main__.py index e9a5ea7..a2e69fe 100644 --- a/email_validator/__main__.py +++ b/email_validator/__main__.py @@ -17,36 +17,24 @@ from .exceptions_types import EmailNotValidError -def __utf8_input_shim(input_str): - if sys.version_info < (3,): - return input_str.decode("utf-8") - return input_str - - -def __utf8_output_shim(output_str): - if sys.version_info < (3,): - return unicode_class(output_str).encode("utf-8") - return output_str - - def main(): if len(sys.argv) == 1: # Validate the email addresses pased line-by-line on STDIN. dns_resolver = caching_resolver() for line in sys.stdin: - email = __utf8_input_shim(line.strip()) + email = line.strip() try: validate_email(email, dns_resolver=dns_resolver) except EmailNotValidError as e: - print(__utf8_output_shim("{} {}".format(email, e))) + print("{} {}".format(email, e)) else: # Validate the email address passed on the command line. 
- email = __utf8_input_shim(sys.argv[1]) + email = sys.argv[1] try: result = validate_email(email) print(json.dumps(result.as_dict(), indent=2, sort_keys=True, ensure_ascii=False)) except EmailNotValidError as e: - print(__utf8_output_shim(e)) + print(e) if __name__ == "__main__": diff --git a/email_validator/rfc_constants.py b/email_validator/rfc_constants.py index 0ad9467..2465c43 100644 --- a/email_validator/rfc_constants.py +++ b/email_validator/rfc_constants.py @@ -1,5 +1,3 @@ -import sys - # These constants are defined by the email specifications. # Based on RFC 2822 section 3.2.4 / RFC 5322 section 3.2.3, these @@ -29,11 +27,3 @@ EMAIL_MAX_LENGTH = 254 LOCAL_PART_MAX_LENGTH = 64 DOMAIN_MAX_LENGTH = 255 - - -# In Python 2.x, turn the regexes above from bytes regexes into unicode -# regexes. If Python 3.x had a "ur" string literal prefix we'd use that instead. -if sys.version_info < (3,): - ATEXT = ATEXT.decode("ascii") - DOT_ATOM_TEXT = DOT_ATOM_TEXT.decode("ascii") - ATEXT_HOSTNAME = ATEXT_HOSTNAME.decode("ascii") diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index 4a888b4..8bca5fb 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -1,16 +1,8 @@ -import sys - from .exceptions_types import EmailSyntaxError, ValidatedEmail from .syntax import validate_email_local_part, validate_email_domain_part, get_length_reason from .deliverability import validate_email_deliverability from .rfc_constants import EMAIL_MAX_LENGTH -# ease compatibility in type checking -if sys.version_info >= (3,): - unicode_class = str -else: - unicode_class = unicode # noqa: F821 - def validate_email( email, @@ -46,7 +38,7 @@ def validate_email( # Allow email to be a str or bytes instance. If bytes, # it must be ASCII because that's how the bytes work # on the wire with SMTP. 
- if not isinstance(email, (str, unicode_class)): + if not isinstance(email, str): try: email = email.decode("ascii") except ValueError: diff --git a/tests/test_main.py b/tests/test_main.py index 89ef596..8d559cc 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -591,31 +591,6 @@ def test_main_multi_input(monkeypatch, capsys): assert test_cases[3] in stdout -def test_main_input_shim(monkeypatch, capsys): - import json - monkeypatch.setattr('sys.version_info', (2, 7)) - test_email = b"google@google.com" - monkeypatch.setattr('sys.argv', ['email_validator', test_email]) - validator_command_line_tool() - stdout, _ = capsys.readouterr() - output = json.loads(str(stdout)) - assert isinstance(output, dict) - assert validate_email(test_email).original_email == output["original_email"] - - -def test_main_output_shim(monkeypatch, capsys): - monkeypatch.setattr('sys.version_info', (2, 7)) - test_email = b"test@.com" - monkeypatch.setattr('sys.argv', ['email_validator', test_email]) - validator_command_line_tool() - stdout, _ = capsys.readouterr() - - # This looks bad but it has to do with the way python 2.7 prints vs py3 - # The \n is part of the print statement, not part of the string, which is what the b'...' is - # Since we're mocking py 2.7 here instead of actually using 2.7, this was the closest I could get - assert stdout == "b'An email address cannot have a period immediately after the @-sign.'\n" - - def test_validate_email__with_caching_resolver(): # unittest.mock.patch("dns.resolver.LRUCache.get") doesn't # work --- it causes get to always return an empty list. 
From 2750a824484c813dcc0128cee44752d966860e7c Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 21 Jan 2023 06:09:00 -0500 Subject: [PATCH 057/174] Drop support for Python <3.6 and dnspython 1.x --- CHANGELOG.md | 1 + README.md | 14 ++++++-------- email_validator/deliverability.py | 23 ++++------------------- setup.cfg | 4 ++-- 4 files changed, 13 insertions(+), 29 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 58c1128..c256496 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ In Development -------------- +* Python versions through 3.5 and dnspython 1.x are no longer supported. Python 3.6+ with dnspython 2.x are now required. * The library has been reorganized internally into smaller modules. Version 1.3.1 (January 21, 2023) diff --git a/README.md b/README.md index 21c61e2..55017ad 100644 --- a/README.md +++ b/README.md @@ -2,19 +2,19 @@ email-validator: Validate Email Addresses ========================================= A robust email address syntax and deliverability validation library for -Python by [Joshua Tauberer](https://joshdata.me). +Python 3.6+ by [Joshua Tauberer](https://joshdata.me). -This library validates that a string is of the form `name@example.com`. This is -the sort of validation you would want for an email-based login form on -a website. +This library validates that a string is of the form `name@example.com` and optionally checks that the domain name is set up to receive email. This is +the sort of validation you would want for an email-based registration form on +a website (but not necessarily for composing an email message). Key features: * Checks that an email address has the correct syntax --- good for - login forms or other uses related to identifying users. + registration/login forms or other uses related to identifying users. * Gives friendly error messages when validation fails (appropriate to show to end users). -* (optionally) Checks deliverability: Does the domain name resolve? 
And you can override the default DNS resolver. +* (optionally) Checks deliverability: Does the domain name resolve? You can override the default DNS resolver. * Supports internationalized domain names and (optionally) internationalized local parts, but blocks unsafe characters. * Normalizes email addresses (super important for internationalized @@ -27,8 +27,6 @@ And this library does NOT permit obsolete forms of email addresses, so if you need strict validation against the email specs exactly, use [pyIsEmail](https://github.com/michaelherold/pyIsEmail). -This library is tested with Python 3.6+ but should work in earlier versions: - [![Build Status](https://app.travis-ci.com/JoshData/python-email-validator.svg?branch=main)](https://app.travis-ci.com/JoshData/python-email-validator) View the [CHANGELOG / Release Notes](CHANGELOG.md) for the version history of changes in the library. Occasionally this README is ahead of the latest published package --- see the CHANGELOG for details. diff --git a/email_validator/deliverability.py b/email_validator/deliverability.py index 2b4ec96..bd1a6c4 100644 --- a/email_validator/deliverability.py +++ b/email_validator/deliverability.py @@ -32,21 +32,6 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve deliverability_info = {} - def dns_resolver_resolve_shim(domain, record): - try: - # dns.resolver.Resolver.resolve is new to dnspython 2.x. - # https://dnspython.readthedocs.io/en/latest/resolver-class.html#dns.resolver.Resolver.resolve - return dns_resolver.resolve(domain, record) - except AttributeError: - # dnspython 2.x is only available in Python 3.6 and later. For earlier versions - # of Python, we maintain compatibility with dnspython 1.x which has a - # dnspython.resolver.Resolver.query method instead. The only difference is that - # query may treat the domain as relative and use the system's search domains, - # which we prevent by adding a "." to the domain name to make it absolute. 
- # dns.resolver.Resolver.query is deprecated in dnspython version 2.x. - # https://dnspython.readthedocs.io/en/latest/resolver-class.html#dns.resolver.Resolver.query - return dns_resolver.query(domain + ".", record) - try: # We need a way to check how timeouts are handled in the tests. So we # have a secret variable that if set makes this method always test the @@ -56,7 +41,7 @@ def dns_resolver_resolve_shim(domain, record): try: # Try resolving for MX records. - response = dns_resolver_resolve_shim(domain, "MX") + response = dns_resolver.resolve(domain, "MX") # For reporting, put them in priority order and remove the trailing dot in the qnames. mtas = sorted([(r.preference, str(r.exchange).rstrip('.')) for r in response]) @@ -76,14 +61,14 @@ def dns_resolver_resolve_shim(domain, record): # If there was no MX record, fall back to an A record, as SMTP servers do. try: - response = dns_resolver_resolve_shim(domain, "A") + response = dns_resolver.resolve(domain, "A") deliverability_info["mx"] = [(0, str(r)) for r in response] deliverability_info["mx_fallback_type"] = "A" except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN, dns.resolver.NoAnswer): # If there was no A record, fall back to an AAAA record. try: - response = dns_resolver_resolve_shim(domain, "AAAA") + response = dns_resolver.resolve(domain, "AAAA") deliverability_info["mx"] = [(0, str(r)) for r in response] deliverability_info["mx_fallback_type"] = "AAAA" except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN, dns.resolver.NoAnswer): @@ -98,7 +83,7 @@ def dns_resolver_resolve_shim(domain, record): # absence of an MX record, this is probably a good sign that the # domain is not used for email. 
try: - response = dns_resolver_resolve_shim(domain, "TXT") + response = dns_resolver.resolve(domain, "TXT") for rec in response: value = b"".join(rec.strings) if value.startswith(b"v=spf1 "): diff --git a/setup.cfg b/setup.cfg index 6a92d0a..14a8941 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,9 +24,9 @@ keywords = email address validator [options] packages = find: install_requires = - dnspython>=1.15.0 + dnspython>=2.0.0 idna>=2.0.0 -python_requires = >=3.5 +python_requires = >=3.6 [options.entry_points] console_scripts = From 3599b22ccf38075886651d708ba1505c6c670f60 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 21 Jan 2023 06:28:22 -0500 Subject: [PATCH 058/174] Update pinned package versions in test_requirements.txt --- test_requirements.txt | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/test_requirements.txt b/test_requirements.txt index 38dab84..f793d1c 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -1,26 +1,27 @@ # This file was generated by running +# sudo docker run --rm -it --network=host python:3.6.15-slim /bin/bash # pip install dnspython idna # from setup.cfg # pip install pytest pytest-cov coverage flake8 # pip freeze # in a virtualenv with Python 3.6. (Some packages' latest versions # are not compatible with Python 3.6, so we must pin versions for # repeatable testing in earlier versions of Python.) 
-attrs==21.4.0 +attrs==22.2.0 coverage==6.2 dnspython==2.2.1 -flake8==4.0.1 -idna==3.3 +flake8==5.0.4 +idna==3.4 importlib-metadata==4.2.0 iniconfig==1.1.1 -mccabe==0.6.1 +mccabe==0.7.0 packaging==21.3 pluggy==1.0.0 py==1.11.0 -pycodestyle==2.8.0 -pyflakes==2.4.0 -pyparsing==3.0.7 +pycodestyle==2.9.1 +pyflakes==2.5.0 +pyparsing==3.0.9 pytest==7.0.1 -pytest-cov==3.0.0 +pytest-cov==4.0.0 tomli==1.2.3 typing_extensions==4.1.1 zipp==3.6.0 From 09012db1e2546eb6057ee5b58a9b37f01aef8368 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 21 Jan 2023 06:01:18 -0500 Subject: [PATCH 059/174] Add Python 3.7, 3.8 back to Travis builds, add 3.12-dev. 3.7, 3.8 are working again. 3.11 is failing without a helpful error. --- .travis.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index d0d8d02..e55ca23 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,10 +5,12 @@ cache: pip python: - '3.6' -#- '3.7' -#- '3.8' +- '3.7' +- '3.8' - '3.9' - '3.10' +#- '3.11' +- '3.12-dev' install: - make install From ea71dfabd0070c18ebb5365ea2806ffa401eba33 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 21 Jan 2023 07:21:54 -0500 Subject: [PATCH 060/174] Implement domain name length checks without relying on the IDNA package --- email_validator/rfc_constants.py | 4 +- email_validator/syntax.py | 107 ++++++++++++++++--------------- tests/test_main.py | 5 +- 3 files changed, 62 insertions(+), 54 deletions(-) diff --git a/email_validator/rfc_constants.py b/email_validator/rfc_constants.py index 2465c43..43f02c2 100644 --- a/email_validator/rfc_constants.py +++ b/email_validator/rfc_constants.py @@ -20,10 +20,12 @@ # the allowed characters of hostnames further. The hyphen cannot be at # the beginning or end of a *dot-atom component* of a hostname either. ATEXT_HOSTNAME = r'(?:(?:[a-zA-Z0-9][a-zA-Z0-9\-]*)?[a-zA-Z0-9])' +DOT_ATOM_TEXT_HOSTNAME = ATEXT_HOSTNAME + r'(?:\.' 
+ ATEXT_HOSTNAME + r')*' # Length constants # RFC 3696 + errata 1003 + errata 1690 (https://www.rfc-editor.org/errata_search.php?rfc=3696&eid=1690) # explains the maximum length of an email address is 254 octets. EMAIL_MAX_LENGTH = 254 LOCAL_PART_MAX_LENGTH = 64 -DOMAIN_MAX_LENGTH = 255 +DNS_LABEL_LENGTH_LIMIT = 63 # RFC 1035 2.3.1 +DOMAIN_MAX_LENGTH = 255 # RFC 1035 2.3.4 diff --git a/email_validator/syntax.py b/email_validator/syntax.py index a3a44eb..d992467 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -1,6 +1,6 @@ from .exceptions_types import EmailSyntaxError from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \ - DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT, ATEXT_HOSTNAME, ATEXT_INTL + DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT, ATEXT_INTL, DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME import re import unicodedata @@ -141,57 +141,52 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera if ".." in domain: raise EmailSyntaxError("An email address cannot have two periods in a row.") - # Regardless of whether international characters are actually used, - # first convert to IDNA ASCII. For ASCII-only domains, the transformation - # does nothing. If internationalized characters are present, the MTA - # must either support SMTPUTF8 or the mail client must convert the - # domain name to IDNA before submission. - # - # Unfortunately this step incorrectly 'fixes' domain names with leading - # periods by removing them, so we have to check for this above. It also gives - # a funky error message ("No input") when there are two periods in a - # row, also checked separately above. - try: - ascii_domain = idna.encode(domain, uts46=False).decode("ascii") - except idna.IDNAError as e: - if "Domain too long" in str(e): - # We can't really be more specific because UTS-46 normalization means - # the length check is applied to a string that is different from the - # one the user supplied. 
Also I'm not sure if the length check applies - # to the internationalized form, the IDNA ASCII form, or even both! - raise EmailSyntaxError("The email address is too long after the @-sign.") - raise EmailSyntaxError("The domain name %s contains invalid characters (%s)." % (domain, str(e))) - - # We may have been given an IDNA ASCII domain to begin with. Check - # that the domain actually conforms to IDNA. It could look like IDNA - # but not be actual IDNA. For ASCII-only domains, the conversion out - # of IDNA just gives the same thing back. - # - # This gives us the canonical internationalized form of the domain, - # which we should use in all error messages. - try: - domain_i18n = idna.decode(ascii_domain.encode('ascii')) - except idna.IDNAError as e: - raise EmailSyntaxError("The domain name %s is not valid IDNA (%s)." % (ascii_domain, str(e))) + if re.match(DOT_ATOM_TEXT_HOSTNAME + "\\Z", domain): + ascii_domain = domain + else: + # If international characters are present in the domain name, convert + # the domain to IDNA ASCII. If internationalized characters are present, + # the MTA must either support SMTPUTF8 or the mail client must convert the + # domain name to IDNA before submission. + # + # Unfortunately this step incorrectly 'fixes' domain names with leading + # periods by removing them, so we have to check for this above. It also gives + # a funky error message ("No input") when there are two periods in a + # row, also checked separately above. + # + # For ASCII-only domains, the transformation does nothing and is safe to + # apply. However, to ensure we don't rely on the idna library for basic + # syntax checks, we don't use it if it's not needed. + try: + ascii_domain = idna.encode(domain, uts46=False).decode("ascii") + except idna.IDNAError as e: + if "Domain too long" in str(e): + # We can't really be more specific because UTS-46 normalization means + # the length check is applied to a string that is different from the + # one the user supplied. 
Also I'm not sure if the length check applies + # to the internationalized form, the IDNA ASCII form, or even both! + raise EmailSyntaxError("The email address is too long after the @-sign.") + raise EmailSyntaxError("The domain name %s contains invalid characters (%s)." % (domain, str(e))) + + # Check the syntax of the string returned by idna.encode. + # It should never fail. + m = re.match(DOT_ATOM_TEXT_HOSTNAME + "\\Z", ascii_domain) + if not m: + raise EmailSyntaxError("The email address contains invalid characters after the @-sign after IDNA encoding.") # RFC 5321 4.5.3.1.2 # We're checking the number of bytes (octets) here, which can be much # higher than the number of characters in internationalized domains, # on the assumption that the domain may be transmitted without SMTPUTF8 - # as IDNA ASCII. This is also checked by idna.encode, so this exception - # is never reached. + # as IDNA ASCII. (This is also checked by idna.encode, so this exception + # is never reached for internationalized domains.) if len(ascii_domain) > DOMAIN_MAX_LENGTH: - raise EmailSyntaxError("The email address is too long after the @-sign.") - - # A "dot atom text", per RFC 2822 3.2.4, but using the restricted - # characters allowed in a hostname (see ATEXT_HOSTNAME above). - DOT_ATOM_TEXT = ATEXT_HOSTNAME + r'(?:\.' + ATEXT_HOSTNAME + r')*' - - # Check the regular expression. This is probably entirely redundant - # with idna.decode, which also checks this format. 
- m = re.match(DOT_ATOM_TEXT + "\\Z", ascii_domain) - if not m: - raise EmailSyntaxError("The email address contains invalid characters after the @-sign.") + reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH) + raise EmailSyntaxError("The email address is too long after the @-sign {}.".format(reason)) + for label in ascii_domain.split("."): + if len(label) > DNS_LABEL_LENGTH_LIMIT: + reason = get_length_reason(label, limit=DNS_LABEL_LENGTH_LIMIT) + raise EmailSyntaxError("The part of the email address \"{}\" is too long {}.".format(label, reason)) if globally_deliverable: # All publicly deliverable addresses have domain named with at least @@ -200,13 +195,11 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera # like. We'll skip this in test environments to allow '@test' email # addresses. if "." not in ascii_domain and not (ascii_domain == "test" and test_environment): - raise EmailSyntaxError("The domain name %s is not valid. It should have a period." % domain_i18n) + raise EmailSyntaxError("The part after the @-sign is not valid. It should have a period.") # We also know that all TLDs currently end with a letter. if not re.search(r"[A-Za-z]\Z", ascii_domain): - raise EmailSyntaxError( - "The domain name %s is not valid. It is not within a valid top-level domain." % domain_i18n - ) + raise EmailSyntaxError("The part after the @-sign is not valid. It is not within a valid top-level domain.") # Check special-use and reserved domain names. # Some might fail DNS-based deliverability checks, but that @@ -218,7 +211,19 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera continue if ascii_domain == d or ascii_domain.endswith("." + d): - raise EmailSyntaxError("The domain name %s is a special-use or reserved name that cannot be used with email." 
% domain_i18n) + raise EmailSyntaxError("The part after the @-sign is a special-use or reserved name that cannot be used with email.") + + # We may have been given an IDNA ASCII domain to begin with. Check + # that the domain actually conforms to IDNA. It could look like IDNA + # but not be actual IDNA. For ASCII-only domains, the conversion out + # of IDNA just gives the same thing back. + # + # This gives us the canonical internationalized form of the domain, + # which we should use in all error messages. + try: + domain_i18n = idna.decode(ascii_domain.encode('ascii')) + except idna.IDNAError as e: + raise EmailSyntaxError("The domain name %s is not valid IDNA (%s)." % (ascii_domain, str(e))) # Return the IDNA ASCII-encoded form of the domain, which is how it # would be transmitted on the wire (except when used with SMTPUTF8 diff --git a/tests/test_main.py b/tests/test_main.py index 8d559cc..08b3c9f 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -212,7 +212,7 @@ def test_email_valid(email_input, output): @pytest.mark.parametrize( 'email_input,error_msg', [ - ('my@localhost', 'The domain name localhost is not valid. It should have a period.'), + ('my@localhost', 'The part after the @-sign is not valid. 
It should have a period.'), ('my@.leadingdot.com', 'An email address cannot have a period immediately after the @-sign.'), ('my@..leadingfwdot.com', 'An email address cannot have a period immediately after the @-sign.'), ('my@..twodots.com', 'An email address cannot have a period immediately after the @-sign.'), @@ -241,7 +241,8 @@ def test_email_valid(email_input, output): ('my\n@example.com', 'The email address contains invalid characters before the @-sign: \'\\n\'.'), ('11111111112222222222333333333344444444445555555555666666666677777@example.com', 'The email address is too long before the @-sign (1 character too many).'), ('111111111122222222223333333333444444444455555555556666666666777777@example.com', 'The email address is too long before the @-sign (2 characters too many).'), - ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444444444455555555556.com', 'The email address is too long after the @-sign.'), + ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444444444455555555556.com', 'The email address is too long (4 characters too many).'), + ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555566.com', 'The email address is too long after the @-sign (1 character too many).'), 
('my.long.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444.info', 'The email address is too long (2 characters too many).'), ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333.info', 'The email address is too long (when converted to IDNA ASCII).'), ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (at least 1 character too many).'), From 75c2136ffd8e7ee3d85b7185a0a9d7a8058d076a Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 21 Jan 2023 07:28:05 -0500 Subject: [PATCH 061/174] Compile some of the regexes at the module level --- email_validator/rfc_constants.py | 9 ++++++--- email_validator/syntax.py | 12 ++++++------ 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/email_validator/rfc_constants.py b/email_validator/rfc_constants.py index 43f02c2..afe0982 100644 --- a/email_validator/rfc_constants.py +++ b/email_validator/rfc_constants.py @@ -1,26 +1,29 @@ # These constants are defined by the email specifications. 
+import re + # Based on RFC 2822 section 3.2.4 / RFC 5322 section 3.2.3, these # characters are permitted in email addresses (not taking into # account internationalization): ATEXT = r'a-zA-Z0-9_!#\$%&\'\*\+\-/=\?\^`\{\|\}~' # A "dot atom text", per RFC 2822 3.2.4: -DOT_ATOM_TEXT = '[' + ATEXT + ']+(?:\\.[' + ATEXT + ']+)*' +DOT_ATOM_TEXT = re.compile('[' + ATEXT + ']+(?:\\.[' + ATEXT + r']+)*\Z') # RFC 6531 section 3.3 extends the allowed characters in internationalized # addresses to also include three specific ranges of UTF8 defined in # RFC3629 section 4, which appear to be the Unicode code points from # U+0080 to U+10FFFF. ATEXT_INTL = ATEXT + u"\u0080-\U0010FFFF" -DOT_ATOM_TEXT_INTL = '[' + ATEXT_INTL + ']+(?:\\.[' + ATEXT_INTL + ']+)*' +DOT_ATOM_TEXT_INTL = re.compile('[' + ATEXT_INTL + ']+(?:\\.[' + ATEXT_INTL + r']+)*\Z') # The domain part of the email address, after IDNA (ASCII) encoding, # must also satisfy the requirements of RFC 952/RFC 1123 which restrict # the allowed characters of hostnames further. The hyphen cannot be at # the beginning or end of a *dot-atom component* of a hostname either. ATEXT_HOSTNAME = r'(?:(?:[a-zA-Z0-9][a-zA-Z0-9\-]*)?[a-zA-Z0-9])' -DOT_ATOM_TEXT_HOSTNAME = ATEXT_HOSTNAME + r'(?:\.' + ATEXT_HOSTNAME + r')*' +DOT_ATOM_TEXT_HOSTNAME = re.compile(ATEXT_HOSTNAME + r'(?:\.' 
+ ATEXT_HOSTNAME + r')*\Z') +DOMAIN_NAME_REGEX = re.compile(r"[A-Za-z]\Z") # all TLDs currently end with a letter # Length constants # RFC 3696 + errata 1003 + errata 1690 (https://www.rfc-editor.org/errata_search.php?rfc=3696&eid=1690) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index d992467..e028163 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -1,6 +1,6 @@ from .exceptions_types import EmailSyntaxError from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \ - DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT, ATEXT_INTL, DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME + DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT, ATEXT_INTL, DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX import re import unicodedata @@ -42,7 +42,7 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals raise EmailSyntaxError("The email address is too long before the @-sign {}.".format(reason)) # Check the local part against the regular expression for the older ASCII requirements. - m = re.match(DOT_ATOM_TEXT + "\\Z", local) + m = DOT_ATOM_TEXT.match(local) if m: # Return the local part unchanged and flag that SMTPUTF8 is not needed. return { @@ -53,7 +53,7 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals else: # The local part failed the ASCII check. Now try the extended internationalized requirements. - m = re.match(DOT_ATOM_TEXT_INTL + "\\Z", local) + m = DOT_ATOM_TEXT_INTL.match(local) if not m: # It's not a valid internationalized address either. Report which characters were not valid. bad_chars = ', '.join(sorted(set( @@ -141,7 +141,7 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera if ".." 
in domain: raise EmailSyntaxError("An email address cannot have two periods in a row.") - if re.match(DOT_ATOM_TEXT_HOSTNAME + "\\Z", domain): + if DOT_ATOM_TEXT_HOSTNAME.match(domain): ascii_domain = domain else: # If international characters are present in the domain name, convert @@ -170,7 +170,7 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera # Check the syntax of the string returned by idna.encode. # It should never fail. - m = re.match(DOT_ATOM_TEXT_HOSTNAME + "\\Z", ascii_domain) + m = DOT_ATOM_TEXT_HOSTNAME.match(ascii_domain) if not m: raise EmailSyntaxError("The email address contains invalid characters after the @-sign after IDNA encoding.") @@ -198,7 +198,7 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera raise EmailSyntaxError("The part after the @-sign is not valid. It should have a period.") # We also know that all TLDs currently end with a letter. - if not re.search(r"[A-Za-z]\Z", ascii_domain): + if not DOMAIN_NAME_REGEX.search(ascii_domain): raise EmailSyntaxError("The part after the @-sign is not valid. It is not within a valid top-level domain.") # Check special-use and reserved domain names. From c782629b77fc5fb9b127c5f5bac6af7e49e8bf99 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 21 Jan 2023 07:35:19 -0500 Subject: [PATCH 062/174] Don't include the domain name in exception message text before its syntax is validated The domain name might have unsafe characters. In any case, the user probably knows what they entered and doesn't need it repeated back to them. 
--- email_validator/syntax.py | 8 ++++---- tests/test_main.py | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index e028163..3c34e56 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -128,7 +128,7 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera try: domain = idna.uts46_remap(domain, std3_rules=False, transitional=False) except idna.IDNAError as e: - raise EmailSyntaxError("The domain name %s contains invalid characters (%s)." % (domain, str(e))) + raise EmailSyntaxError("The part after the @-sign contains invalid characters ({}).".format(str(e))) # Now we can perform basic checks on the use of periods (since equivalent # symbols have been mapped to periods). These checks are needed because the @@ -166,7 +166,7 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera # one the user supplied. Also I'm not sure if the length check applies # to the internationalized form, the IDNA ASCII form, or even both! raise EmailSyntaxError("The email address is too long after the @-sign.") - raise EmailSyntaxError("The domain name %s contains invalid characters (%s)." % (domain, str(e))) + raise EmailSyntaxError("The part after the @-sign contains invalid characters (%s)." % str(e)) # Check the syntax of the string returned by idna.encode. # It should never fail. 
@@ -186,7 +186,7 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera for label in ascii_domain.split("."): if len(label) > DNS_LABEL_LENGTH_LIMIT: reason = get_length_reason(label, limit=DNS_LABEL_LENGTH_LIMIT) - raise EmailSyntaxError("The part of the email address \"{}\" is too long {}.".format(label, reason)) + raise EmailSyntaxError("On either side of the @-sign, periods cannot be separated by so many characters {}.".format(reason)) if globally_deliverable: # All publicly deliverable addresses have domain named with at least @@ -223,7 +223,7 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera try: domain_i18n = idna.decode(ascii_domain.encode('ascii')) except idna.IDNAError as e: - raise EmailSyntaxError("The domain name %s is not valid IDNA (%s)." % (ascii_domain, str(e))) + raise EmailSyntaxError("The part after the @-sign is not valid IDNA ({}).".format(str(e))) # Return the IDNA ASCII-encoded form of the domain, which is how it # would be transmitted on the wire (except when used with SMTPUTF8 diff --git a/tests/test_main.py b/tests/test_main.py index 08b3c9f..963564a 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -218,22 +218,22 @@ def test_email_valid(email_input, output): ('my@..twodots.com', 'An email address cannot have a period immediately after the @-sign.'), ('my@twodots..com', 'An email address cannot have two periods in a row.'), ('my@baddash.-.com', - 'The domain name baddash.-.com contains invalid characters (Label must not start or end with a hyphen).'), + 'The part after the @-sign contains invalid characters (Label must not start or end with a hyphen).'), ('my@baddash.-a.com', - 'The domain name baddash.-a.com contains invalid characters (Label must not start or end with a hyphen).'), + 'The part after the @-sign contains invalid characters (Label must not start or end with a hyphen).'), ('my@baddash.b-.com', - 'The domain name baddash.b-.com contains invalid 
characters (Label must not start or end with a hyphen).'), + 'The part after the @-sign contains invalid characters (Label must not start or end with a hyphen).'), ('my@example.com\n', - 'The domain name example.com\n contains invalid characters (Codepoint U+000A at position 4 of ' + 'The part after the @-sign contains invalid characters (Codepoint U+000A at position 4 of ' '\'com\\n\' not allowed).'), ('my@example\n.com', - 'The domain name example\n.com contains invalid characters (Codepoint U+000A at position 8 of ' + 'The part after the @-sign contains invalid characters (Codepoint U+000A at position 8 of ' '\'example\\n\' not allowed).'), ('.leadingdot@domain.com', 'The email address contains invalid characters before the @-sign: FULL STOP.'), ('..twodots@domain.com', 'The email address contains invalid characters before the @-sign: FULL STOP.'), ('twodots..here@domain.com', 'The email address contains invalid characters before the @-sign: FULL STOP.'), ('me@⒈wouldbeinvalid.com', - "The domain name ⒈wouldbeinvalid.com contains invalid characters (Codepoint U+2488 not allowed " + "The part after the @-sign contains invalid characters (Codepoint U+2488 not allowed " "at position 1 in '⒈wouldbeinvalid.com')."), ('@example.com', 'There must be something before the @-sign.'), ('\nmy@example.com', 'The email address contains invalid characters before the @-sign: \'\\n\'.'), From 364d2d189e28b334437dcdbf554a484d07977a9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= Date: Sun, 22 Jan 2023 14:44:12 +0100 Subject: [PATCH 063/174] Replace deprecated `license_file` in `setup.cfg` (#94) Replace the deprecated option with `license_files`, to fix the following deprecation warning: /usr/lib/python3.11/site-packages/setuptools/config/setupcfg.py:515: SetuptoolsDeprecationWarning: The license_file parameter is deprecated, use license_files instead. The new option is backwards compatible and is available since setuptools 42.0.0. 
--- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 14a8941..6617e4d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -8,7 +8,7 @@ url = https://github.com/JoshData/python-email-validator author = Joshua Tauberer author_email = jt@occams.info license = CC0 (copyright waived) -license_file = LICENSE +license_files = LICENSE classifiers = Development Status :: 5 - Production/Stable Intended Audience :: Developers From e6e8f07b60225cadf50f6359260d0f5ac6a02fe9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Gu=C3=A9rin?= Date: Sat, 4 Feb 2023 19:41:10 +0100 Subject: [PATCH 064/174] Lazy load dns.resolver (#93) I usually use `email_validator` through `pydantic`, which doesn't check deliverability. The import of email_validator always triggers the import of `dns.resolver`, which will try to import `httpx`, `requests`, etc. This import time can negatively impact the startup time of some apps. I propose here to postpone the import of `.deliverability` to the first time it is needed. --- email_validator/__init__.py | 11 +++++++++-- email_validator/validate_email.py | 4 +++- setup.cfg | 2 +- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/email_validator/__init__.py b/email_validator/__init__.py index 6a91fda..94ebcb6 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -1,9 +1,16 @@ # -*- coding: utf-8 -*- # Export the main method, helper methods, and the public data types. 
-from .validate_email import validate_email # noqa: F401 -from .deliverability import caching_resolver # noqa: F401 from .exceptions_types import * # noqa: F401,F403 +from .validate_email import validate_email # noqa: F401 + + +def caching_resolver(*args, **kwargs): + # Lazy load `deliverability` as it is slow to import (due to dns.resolver) + from .deliverability import caching_resolver + + return caching_resolver(*args, **kwargs) + # These global attributes are a part of the library's API and can be # changed by library users. diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index 8bca5fb..73e5ee7 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -1,6 +1,5 @@ from .exceptions_types import EmailSyntaxError, ValidatedEmail from .syntax import validate_email_local_part, validate_email_domain_part, get_length_reason -from .deliverability import validate_email_deliverability from .rfc_constants import EMAIL_MAX_LENGTH @@ -115,6 +114,9 @@ def validate_email( if check_deliverability and not test_environment: # Validate the email address's deliverability using DNS # and update the return dict with metadata. 
+ + # Lazy load `deliverability` as it is slow to import (due to dns.resolver) + from .deliverability import validate_email_deliverability deliverability_info = validate_email_deliverability( ret["domain"], ret["domain_i18n"], timeout, dns_resolver ) diff --git a/setup.cfg b/setup.cfg index 6617e4d..e35bc55 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,7 +24,7 @@ keywords = email address validator [options] packages = find: install_requires = - dnspython>=2.0.0 + dnspython>=2.0.0 # optional if deliverability check isn't needed idna>=2.0.0 python_requires = >=3.6 From 7798028eac72ce7a8a6197d2324a9f9e1bc4c894 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Fri, 27 Jan 2023 10:41:48 -0500 Subject: [PATCH 065/174] Split the tests into smaller modules --- tests/test_deliverability.py | 88 ++++++ tests/test_main.py | 584 +---------------------------------- tests/test_syntax.py | 500 ++++++++++++++++++++++++++++++ 3 files changed, 589 insertions(+), 583 deletions(-) create mode 100644 tests/test_deliverability.py create mode 100644 tests/test_syntax.py diff --git a/tests/test_deliverability.py b/tests/test_deliverability.py new file mode 100644 index 0000000..e8a7cce --- /dev/null +++ b/tests/test_deliverability.py @@ -0,0 +1,88 @@ +import dns.resolver +import pytest +import re + +from email_validator import EmailUndeliverableError, \ + validate_email +from email_validator.deliverability import caching_resolver, validate_email_deliverability + + +def test_deliverability_found(): + response = validate_email_deliverability('gmail.com', 'gmail.com') + assert response.keys() == {'mx', 'mx_fallback_type'} + assert response['mx_fallback_type'] is None + assert len(response['mx']) > 1 + assert len(response['mx'][0]) == 2 + assert isinstance(response['mx'][0][0], int) + assert response['mx'][0][1].endswith('.com') + + +def test_deliverability_fails(): + # No MX record. 
+ domain = 'xkxufoekjvjfjeodlfmdfjcu.com' + with pytest.raises(EmailUndeliverableError, match='The domain name {} does not exist'.format(domain)): + validate_email_deliverability(domain, domain) + + # Null MX record. + domain = 'example.com' + with pytest.raises(EmailUndeliverableError, match='The domain name {} does not accept email'.format(domain)): + validate_email_deliverability(domain, domain) + + +@pytest.mark.parametrize( + 'email_input', + [ + ('me@mail.example'), + ('me@example.com'), + ('me@mail.example.com'), + ], +) +def test_email_example_reserved_domain(email_input): + # Since these all fail deliverabiltiy from a static list, + # DNS deliverability checks do not arise. + with pytest.raises(EmailUndeliverableError) as exc_info: + validate_email(email_input) + # print(f'({email_input!r}, {str(exc_info.value)!r}),') + assert re.match(r"The domain name [a-z\.]+ does not (accept email|exist)\.", str(exc_info.value)) is not None + + +def test_deliverability_dns_timeout(): + validate_email_deliverability.TEST_CHECK_TIMEOUT = True + response = validate_email_deliverability('gmail.com', 'gmail.com') + assert "mx" not in response + assert response.get("unknown-deliverability") == "timeout" + validate_email('test@gmail.com') + del validate_email_deliverability.TEST_CHECK_TIMEOUT + + +def test_validate_email__with_caching_resolver(): + # unittest.mock.patch("dns.resolver.LRUCache.get") doesn't + # work --- it causes get to always return an empty list. + # So we'll mock our own way. + class MockedCache: + get_called = False + put_called = False + + def get(self, key): + self.get_called = True + return None + + def put(self, key, value): + self.put_called = True + + # Test with caching_resolver helper method. 
+ mocked_cache = MockedCache() + dns_resolver = caching_resolver(cache=mocked_cache) + validate_email("test@gmail.com", dns_resolver=dns_resolver) + assert mocked_cache.put_called + validate_email("test@gmail.com", dns_resolver=dns_resolver) + assert mocked_cache.get_called + + # Test with dns.resolver.Resolver instance. + dns_resolver = dns.resolver.Resolver() + dns_resolver.lifetime = 10 + dns_resolver.cache = MockedCache() + validate_email("test@gmail.com", dns_resolver=dns_resolver) + assert mocked_cache.put_called + validate_email("test@gmail.com", dns_resolver=dns_resolver) + assert mocked_cache.get_called diff --git a/tests/test_main.py b/tests/test_main.py index 963564a..e087e2d 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,526 +1,8 @@ -import dns.resolver -import re -import pytest -from email_validator import EmailSyntaxError, EmailUndeliverableError, \ - validate_email, \ - ValidatedEmail -from email_validator.deliverability import caching_resolver, validate_email_deliverability +from email_validator import validate_email # Let's test main but rename it to be clear from email_validator.__main__ import main as validator_command_line_tool -@pytest.mark.parametrize( - 'email_input,output', - [ - ( - 'Abc@example.tld', - ValidatedEmail( - local_part='Abc', - ascii_local_part='Abc', - smtputf8=False, - ascii_domain='example.tld', - domain='example.tld', - email='Abc@example.tld', - ascii_email='Abc@example.tld', - ), - ), - ( - 'Abc.123@test-example.com', - ValidatedEmail( - local_part='Abc.123', - ascii_local_part='Abc.123', - smtputf8=False, - ascii_domain='test-example.com', - domain='test-example.com', - email='Abc.123@test-example.com', - ascii_email='Abc.123@test-example.com', - ), - ), - ( - 'user+mailbox/department=shipping@example.tld', - ValidatedEmail( - local_part='user+mailbox/department=shipping', - ascii_local_part='user+mailbox/department=shipping', - smtputf8=False, - ascii_domain='example.tld', - domain='example.tld', - 
email='user+mailbox/department=shipping@example.tld', - ascii_email='user+mailbox/department=shipping@example.tld', - ), - ), - ( - "!#$%&'*+-/=?^_`.{|}~@example.tld", - ValidatedEmail( - local_part="!#$%&'*+-/=?^_`.{|}~", - ascii_local_part="!#$%&'*+-/=?^_`.{|}~", - smtputf8=False, - ascii_domain='example.tld', - domain='example.tld', - email="!#$%&'*+-/=?^_`.{|}~@example.tld", - ascii_email="!#$%&'*+-/=?^_`.{|}~@example.tld", - ), - ), - ( - '伊昭傑@郵件.商務', - ValidatedEmail( - local_part='伊昭傑', - smtputf8=True, - ascii_domain='xn--5nqv22n.xn--lhr59c', - domain='郵件.商務', - email='伊昭傑@郵件.商務', - ), - ), - ( - 'राम@मोहन.ईन्फो', - ValidatedEmail( - local_part='राम', - smtputf8=True, - ascii_domain='xn--l2bl7a9d.xn--o1b8dj2ki', - domain='मोहन.ईन्फो', - email='राम@मोहन.ईन्फो', - ), - ), - ( - 'юзер@екзампл.ком', - ValidatedEmail( - local_part='юзер', - smtputf8=True, - ascii_domain='xn--80ajglhfv.xn--j1aef', - domain='екзампл.ком', - email='юзер@екзампл.ком', - ), - ), - ( - 'θσερ@εχαμπλε.ψομ', - ValidatedEmail( - local_part='θσερ', - smtputf8=True, - ascii_domain='xn--mxahbxey0c.xn--xxaf0a', - domain='εχαμπλε.ψομ', - email='θσερ@εχαμπλε.ψομ', - ), - ), - ( - '葉士豪@臺網中心.tw', - ValidatedEmail( - local_part='葉士豪', - smtputf8=True, - ascii_domain='xn--fiqq24b10vi0d.tw', - domain='臺網中心.tw', - email='葉士豪@臺網中心.tw', - ), - ), - ( - 'jeff@臺網中心.tw', - ValidatedEmail( - local_part='jeff', - ascii_local_part='jeff', - smtputf8=False, - ascii_domain='xn--fiqq24b10vi0d.tw', - domain='臺網中心.tw', - email='jeff@臺網中心.tw', - ascii_email='jeff@xn--fiqq24b10vi0d.tw', - ), - ), - ( - '葉士豪@臺網中心.台灣', - ValidatedEmail( - local_part='葉士豪', - smtputf8=True, - ascii_domain='xn--fiqq24b10vi0d.xn--kpry57d', - domain='臺網中心.台灣', - email='葉士豪@臺網中心.台灣', - ), - ), - ( - 'jeff葉@臺網中心.tw', - ValidatedEmail( - local_part='jeff葉', - smtputf8=True, - ascii_domain='xn--fiqq24b10vi0d.tw', - domain='臺網中心.tw', - email='jeff葉@臺網中心.tw', - ), - ), - ( - 'ñoñó@example.tld', - ValidatedEmail( - local_part='ñoñó', - 
smtputf8=True, - ascii_domain='example.tld', - domain='example.tld', - email='ñoñó@example.tld', - ), - ), - ( - '我買@example.tld', - ValidatedEmail( - local_part='我買', - smtputf8=True, - ascii_domain='example.tld', - domain='example.tld', - email='我買@example.tld', - ), - ), - ( - '甲斐黒川日本@example.tld', - ValidatedEmail( - local_part='甲斐黒川日本', - smtputf8=True, - ascii_domain='example.tld', - domain='example.tld', - email='甲斐黒川日本@example.tld', - ), - ), - ( - 'чебурашкаящик-с-апельсинами.рф@example.tld', - ValidatedEmail( - local_part='чебурашкаящик-с-апельсинами.рф', - smtputf8=True, - ascii_domain='example.tld', - domain='example.tld', - email='чебурашкаящик-с-апельсинами.рф@example.tld', - ), - ), - ( - 'उदाहरण.परीक्ष@domain.with.idn.tld', - ValidatedEmail( - local_part='उदाहरण.परीक्ष', - smtputf8=True, - ascii_domain='domain.with.idn.tld', - domain='domain.with.idn.tld', - email='उदाहरण.परीक्ष@domain.with.idn.tld', - ), - ), - ( - 'ιωάννης@εεττ.gr', - ValidatedEmail( - local_part='ιωάννης', - smtputf8=True, - ascii_domain='xn--qxaa9ba.gr', - domain='εεττ.gr', - email='ιωάννης@εεττ.gr', - ), - ), - ], -) -def test_email_valid(email_input, output): - # print(f'({email_input!r}, {validate_email(email_input, check_deliverability=False)!r}),') - assert validate_email(email_input, check_deliverability=False) == output - - -@pytest.mark.parametrize( - 'email_input,error_msg', - [ - ('my@localhost', 'The part after the @-sign is not valid. 
It should have a period.'), - ('my@.leadingdot.com', 'An email address cannot have a period immediately after the @-sign.'), - ('my@..leadingfwdot.com', 'An email address cannot have a period immediately after the @-sign.'), - ('my@..twodots.com', 'An email address cannot have a period immediately after the @-sign.'), - ('my@twodots..com', 'An email address cannot have two periods in a row.'), - ('my@baddash.-.com', - 'The part after the @-sign contains invalid characters (Label must not start or end with a hyphen).'), - ('my@baddash.-a.com', - 'The part after the @-sign contains invalid characters (Label must not start or end with a hyphen).'), - ('my@baddash.b-.com', - 'The part after the @-sign contains invalid characters (Label must not start or end with a hyphen).'), - ('my@example.com\n', - 'The part after the @-sign contains invalid characters (Codepoint U+000A at position 4 of ' - '\'com\\n\' not allowed).'), - ('my@example\n.com', - 'The part after the @-sign contains invalid characters (Codepoint U+000A at position 8 of ' - '\'example\\n\' not allowed).'), - ('.leadingdot@domain.com', 'The email address contains invalid characters before the @-sign: FULL STOP.'), - ('..twodots@domain.com', 'The email address contains invalid characters before the @-sign: FULL STOP.'), - ('twodots..here@domain.com', 'The email address contains invalid characters before the @-sign: FULL STOP.'), - ('me@⒈wouldbeinvalid.com', - "The part after the @-sign contains invalid characters (Codepoint U+2488 not allowed " - "at position 1 in '⒈wouldbeinvalid.com')."), - ('@example.com', 'There must be something before the @-sign.'), - ('\nmy@example.com', 'The email address contains invalid characters before the @-sign: \'\\n\'.'), - ('m\ny@example.com', 'The email address contains invalid characters before the @-sign: \'\\n\'.'), - ('my\n@example.com', 'The email address contains invalid characters before the @-sign: \'\\n\'.'), - 
('11111111112222222222333333333344444444445555555555666666666677777@example.com', 'The email address is too long before the @-sign (1 character too many).'), - ('111111111122222222223333333333444444444455555555556666666666777777@example.com', 'The email address is too long before the @-sign (2 characters too many).'), - ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444444444455555555556.com', 'The email address is too long (4 characters too many).'), - ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555566.com', 'The email address is too long after the @-sign (1 character too many).'), - ('my.long.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444.info', 'The email address is too long (2 characters too many).'), - ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333.info', 'The email address is too long (when converted to IDNA ASCII).'), - ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (at least 1 character too many).'), - 
('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444.info', 'The email address is too long (when encoded in bytes).'), - ('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (at least 1 character too many).'), - ], -) -def test_email_invalid_syntax(email_input, error_msg): - # Since these all have syntax errors, deliverability - # checks do not arise. - with pytest.raises(EmailSyntaxError) as exc_info: - validate_email(email_input) - # print(f'({email_input!r}, {str(exc_info.value)!r}),') - assert str(exc_info.value) == error_msg - - -@pytest.mark.parametrize( - 'email_input', - [ - ('me@anything.arpa'), - ('me@valid.invalid'), - ('me@link.local'), - ('me@host.localhost'), - ('me@onion.onion.onion'), - ('me@test.test.test'), - ], -) -def test_email_invalid_reserved_domain(email_input): - # Since these all fail deliverabiltiy from a static list, - # DNS deliverability checks do not arise. - with pytest.raises(EmailSyntaxError) as exc_info: - validate_email(email_input) - # print(f'({email_input!r}, {str(exc_info.value)!r}),') - assert "is a special-use or reserved name" in str(exc_info.value) - - -@pytest.mark.parametrize( - 'email_input', - [ - ('me@mail.example'), - ('me@example.com'), - ('me@mail.example.com'), - ], -) -def test_email_example_reserved_domain(email_input): - # Since these all fail deliverabiltiy from a static list, - # DNS deliverability checks do not arise. 
- with pytest.raises(EmailUndeliverableError) as exc_info: - validate_email(email_input) - # print(f'({email_input!r}, {str(exc_info.value)!r}),') - assert re.match(r"The domain name [a-z\.]+ does not (accept email|exist)\.", str(exc_info.value)) is not None - - -@pytest.mark.parametrize( - 'email_input', - [ - ('white space@test'), - ('\n@test'), - ('\u2005@test'), # four-per-em space (Zs) - ('\u009C@test'), # string terminator (Cc) - ('\u200B@test'), # zero-width space (Cf) - ('\u202Dforward-\u202Ereversed@test'), # BIDI (Cf) - ('\uD800@test'), # surrogate (Cs) - ('\uE000@test'), # private use (Co) - ('\uFDEF@test'), # unassigned (Cn) - ], -) -def test_email_unsafe_character(email_input): - # Check for various unsafe characters: - with pytest.raises(EmailSyntaxError) as exc_info: - validate_email(email_input, test_environment=True) - assert "invalid character" in str(exc_info.value) - - -def test_email_test_domain_name_in_test_environment(): - validate_email("anything@test", test_environment=True) - validate_email("anything@mycompany.test", test_environment=True) - - -# This is the pyIsEmail (https://github.com/michaelherold/pyIsEmail) test suite. 
-# -# The test data was extracted by: -# -# $ wget https://raw.githubusercontent.com/michaelherold/pyIsEmail/master/tests/data/tests.xml -# $ xmllint --xpath '/tests/test/address/text()' tests.xml > t1 -# $ xmllint --xpath "/tests/test[not(address='')]/diagnosis/text()" tests.xml > t2 -# -# tests = [] -# def fixup_char(c): -# if ord(c) >= 0x2400 and ord(c) <= 0x2432: -# c = chr(ord(c)-0x2400) -# return c -# for email, diagnosis in zip(open("t1"), open("t2")): -# email = email[:-1] # strip trailing \n but not more because trailing whitespace is significant -# email = "".join(fixup_char(c) for c in email).replace("&", "&") -# tests.append([email, diagnosis.strip()]) -# print(repr(tests).replace("'], ['", "'],\n['")) -@pytest.mark.parametrize( - ('email_input', 'status'), - [ - ['test', 'ISEMAIL_ERR_NODOMAIN'], - ['@', 'ISEMAIL_ERR_NOLOCALPART'], - ['test@', 'ISEMAIL_ERR_NODOMAIN'], - # ['test@io', 'ISEMAIL_VALID'], # we reject domains without a dot, knowing they are not deliverable - ['@io', 'ISEMAIL_ERR_NOLOCALPART'], - ['@iana.org', 'ISEMAIL_ERR_NOLOCALPART'], - ['test@iana.org', 'ISEMAIL_VALID'], - ['test@nominet.org.uk', 'ISEMAIL_VALID'], - ['test@about.museum', 'ISEMAIL_VALID'], - ['a@iana.org', 'ISEMAIL_VALID'], - ['test.test@iana.org', 'ISEMAIL_VALID'], - ['.test@iana.org', 'ISEMAIL_ERR_DOT_START'], - ['test.@iana.org', 'ISEMAIL_ERR_DOT_END'], - ['test..iana.org', 'ISEMAIL_ERR_CONSECUTIVEDOTS'], - ['test_exa-mple.com', 'ISEMAIL_ERR_NODOMAIN'], - ['!#$%&`*+/=?^`{|}~@iana.org', 'ISEMAIL_VALID'], - ['test\\@test@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], - ['123@iana.org', 'ISEMAIL_VALID'], - ['test@123.com', 'ISEMAIL_VALID'], - ['abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghiklm@iana.org', 'ISEMAIL_VALID'], - ['abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghiklmn@iana.org', 'ISEMAIL_RFC5322_LOCAL_TOOLONG'], - ['test@abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghiklm.com', 'ISEMAIL_RFC5322_LABEL_TOOLONG'], - 
['test@mason-dixon.com', 'ISEMAIL_VALID'], - ['test@-iana.org', 'ISEMAIL_ERR_DOMAINHYPHENSTART'], - ['test@iana-.com', 'ISEMAIL_ERR_DOMAINHYPHENEND'], - ['test@g--a.com', 'ISEMAIL_VALID'], - ['test@.iana.org', 'ISEMAIL_ERR_DOT_START'], - ['test@iana.org.', 'ISEMAIL_ERR_DOT_END'], - ['test@iana..com', 'ISEMAIL_ERR_CONSECUTIVEDOTS'], - ['abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghiklm@abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghij', 'ISEMAIL_RFC5322_TOOLONG'], - ['a@abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefg.hij', 'ISEMAIL_RFC5322_TOOLONG'], - ['a@abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefg.hijk', 'ISEMAIL_RFC5322_DOMAIN_TOOLONG'], - ['"test"@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], - ['""@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], - ['"""@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], - ['"\\a"@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], - ['"\\""@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], - ['"\\"@iana.org', 'ISEMAIL_ERR_UNCLOSEDQUOTEDSTR'], - ['"\\\\"@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], - ['test"@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], - ['"test@iana.org', 'ISEMAIL_ERR_UNCLOSEDQUOTEDSTR'], - ['"test"test@iana.org', 'ISEMAIL_ERR_ATEXT_AFTER_QS'], - ['test"text"@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], - ['"test""test"@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], - ['"test"."test"@iana.org', 'ISEMAIL_DEPREC_LOCALPART'], - ['"test\\ test"@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], - 
['"test".test@iana.org', 'ISEMAIL_DEPREC_LOCALPART'], - ['"test\x00"@iana.org', 'ISEMAIL_ERR_EXPECTING_QTEXT'], - ['"test\\\x00"@iana.org', 'ISEMAIL_DEPREC_QP'], - ['"abcdefghijklmnopqrstuvwxyz abcdefghijklmnopqrstuvwxyz abcdefghj"@iana.org', 'ISEMAIL_RFC5322_LOCAL_TOOLONG'], - ['"abcdefghijklmnopqrstuvwxyz abcdefghijklmnopqrstuvwxyz abcdefg\\h"@iana.org', 'ISEMAIL_RFC5322_LOCAL_TOOLONG'], - ['test@[255.255.255.255]', 'ISEMAIL_RFC5321_ADDRESSLITERAL'], - ['test@a[255.255.255.255]', 'ISEMAIL_ERR_EXPECTING_ATEXT'], - ['test@[255.255.255]', 'ISEMAIL_RFC5322_DOMAINLITERAL'], - ['test@[255.255.255.255.255]', 'ISEMAIL_RFC5322_DOMAINLITERAL'], - ['test@[255.255.255.256]', 'ISEMAIL_RFC5322_DOMAINLITERAL'], - ['test@[1111:2222:3333:4444:5555:6666:7777:8888]', 'ISEMAIL_RFC5322_DOMAINLITERAL'], - ['test@[IPv6:1111:2222:3333:4444:5555:6666:7777]', 'ISEMAIL_RFC5322_IPV6_GRPCOUNT'], - ['test@[IPv6:1111:2222:3333:4444:5555:6666:7777:8888]', 'ISEMAIL_RFC5321_ADDRESSLITERAL'], - ['test@[IPv6:1111:2222:3333:4444:5555:6666:7777:8888:9999]', 'ISEMAIL_RFC5322_IPV6_GRPCOUNT'], - ['test@[IPv6:1111:2222:3333:4444:5555:6666:7777:888G]', 'ISEMAIL_RFC5322_IPV6_BADCHAR'], - ['test@[IPv6:1111:2222:3333:4444:5555:6666::8888]', 'ISEMAIL_RFC5321_IPV6DEPRECATED'], - ['test@[IPv6:1111:2222:3333:4444:5555::8888]', 'ISEMAIL_RFC5321_ADDRESSLITERAL'], - ['test@[IPv6:1111:2222:3333:4444:5555:6666::7777:8888]', 'ISEMAIL_RFC5322_IPV6_MAXGRPS'], - ['test@[IPv6::3333:4444:5555:6666:7777:8888]', 'ISEMAIL_RFC5322_IPV6_COLONSTRT'], - ['test@[IPv6:::3333:4444:5555:6666:7777:8888]', 'ISEMAIL_RFC5321_ADDRESSLITERAL'], - ['test@[IPv6:1111::4444:5555::8888]', 'ISEMAIL_RFC5322_IPV6_2X2XCOLON'], - ['test@[IPv6:::]', 'ISEMAIL_RFC5321_ADDRESSLITERAL'], - ['test@[IPv6:1111:2222:3333:4444:5555:255.255.255.255]', 'ISEMAIL_RFC5322_IPV6_GRPCOUNT'], - ['test@[IPv6:1111:2222:3333:4444:5555:6666:255.255.255.255]', 'ISEMAIL_RFC5321_ADDRESSLITERAL'], - ['test@[IPv6:1111:2222:3333:4444:5555:6666:7777:255.255.255.255]', 
'ISEMAIL_RFC5322_IPV6_GRPCOUNT'], - ['test@[IPv6:1111:2222:3333:4444::255.255.255.255]', 'ISEMAIL_RFC5321_ADDRESSLITERAL'], - ['test@[IPv6:1111:2222:3333:4444:5555:6666::255.255.255.255]', 'ISEMAIL_RFC5322_IPV6_MAXGRPS'], - ['test@[IPv6:1111:2222:3333:4444:::255.255.255.255]', 'ISEMAIL_RFC5322_IPV6_2X2XCOLON'], - ['test@[IPv6::255.255.255.255]', 'ISEMAIL_RFC5322_IPV6_COLONSTRT'], - [' test @iana.org', 'ISEMAIL_DEPREC_CFWS_NEAR_AT'], - ['test@ iana .com', 'ISEMAIL_DEPREC_CFWS_NEAR_AT'], - ['test . test@iana.org', 'ISEMAIL_DEPREC_FWS'], - ['\r\n test@iana.org', 'ISEMAIL_CFWS_FWS'], - ['\r\n \r\n test@iana.org', 'ISEMAIL_DEPREC_FWS'], - ['(comment)test@iana.org', 'ISEMAIL_CFWS_COMMENT'], - ['((comment)test@iana.org', 'ISEMAIL_ERR_UNCLOSEDCOMMENT'], - ['(comment(comment))test@iana.org', 'ISEMAIL_CFWS_COMMENT'], - ['test@(comment)iana.org', 'ISEMAIL_DEPREC_CFWS_NEAR_AT'], - ['test(comment)test@iana.org', 'ISEMAIL_ERR_ATEXT_AFTER_CFWS'], - ['test@(comment)[255.255.255.255]', 'ISEMAIL_DEPREC_CFWS_NEAR_AT'], - ['(comment)abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghiklm@iana.org', 'ISEMAIL_CFWS_COMMENT'], - ['test@(comment)abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.com', 'ISEMAIL_DEPREC_CFWS_NEAR_AT'], - ['(comment)test@abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghik.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghik.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk.abcdefghijklmnopqrstuvwxyzabcdefghijk.abcdefghijklmnopqrstu', 'ISEMAIL_CFWS_COMMENT'], - ['test@iana.org\n', 'ISEMAIL_ERR_EXPECTING_ATEXT'], - ['test@xn--hxajbheg2az3al.xn--jxalpdlp', 'ISEMAIL_VALID'], - ['xn--test@iana.org', 'ISEMAIL_VALID'], - ['test@iana.org-', 'ISEMAIL_ERR_DOMAINHYPHENEND'], - ['"test@iana.org', 'ISEMAIL_ERR_UNCLOSEDQUOTEDSTR'], - ['(test@iana.org', 'ISEMAIL_ERR_UNCLOSEDCOMMENT'], - ['test@(iana.org', 'ISEMAIL_ERR_UNCLOSEDCOMMENT'], - ['test@[1.2.3.4', 'ISEMAIL_ERR_UNCLOSEDDOMLIT'], - ['"test\\"@iana.org', 
'ISEMAIL_ERR_UNCLOSEDQUOTEDSTR'], - ['(comment\\)test@iana.org', 'ISEMAIL_ERR_UNCLOSEDCOMMENT'], - ['test@iana.org(comment\\)', 'ISEMAIL_ERR_UNCLOSEDCOMMENT'], - ['test@iana.org(comment\\', 'ISEMAIL_ERR_BACKSLASHEND'], - ['test@[RFC-5322-domain-literal]', 'ISEMAIL_RFC5322_DOMAINLITERAL'], - ['test@[RFC-5322]-domain-literal]', 'ISEMAIL_ERR_ATEXT_AFTER_DOMLIT'], - ['test@[RFC-5322-[domain-literal]', 'ISEMAIL_ERR_EXPECTING_DTEXT'], - ['test@[RFC-5322-\\\x07-domain-literal]', 'ISEMAIL_RFC5322_DOMLIT_OBSDTEXT'], - ['test@[RFC-5322-\\\t-domain-literal]', 'ISEMAIL_RFC5322_DOMLIT_OBSDTEXT'], - ['test@[RFC-5322-\\]-domain-literal]', 'ISEMAIL_RFC5322_DOMLIT_OBSDTEXT'], - ['test@[RFC-5322-domain-literal\\]', 'ISEMAIL_ERR_UNCLOSEDDOMLIT'], - ['test@[RFC-5322-domain-literal\\', 'ISEMAIL_ERR_BACKSLASHEND'], - ['test@[RFC 5322 domain literal]', 'ISEMAIL_RFC5322_DOMAINLITERAL'], - ['test@[RFC-5322-domain-literal] (comment)', 'ISEMAIL_RFC5322_DOMAINLITERAL'], - ['\x7f@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], - ['test@\x7f.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], - ['"\x7f"@iana.org', 'ISEMAIL_DEPREC_QTEXT'], - ['"\\\x7f"@iana.org', 'ISEMAIL_DEPREC_QP'], - ['(\x7f)test@iana.org', 'ISEMAIL_DEPREC_CTEXT'], - ['test@iana.org\r', 'ISEMAIL_ERR_CR_NO_LF'], - ['\rtest@iana.org', 'ISEMAIL_ERR_CR_NO_LF'], - ['"\rtest"@iana.org', 'ISEMAIL_ERR_CR_NO_LF'], - ['(\r)test@iana.org', 'ISEMAIL_ERR_CR_NO_LF'], - ['test@iana.org(\r)', 'ISEMAIL_ERR_CR_NO_LF'], - ['\ntest@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], - ['"\n"@iana.org', 'ISEMAIL_ERR_EXPECTING_QTEXT'], - ['"\\\n"@iana.org', 'ISEMAIL_DEPREC_QP'], - ['(\n)test@iana.org', 'ISEMAIL_ERR_EXPECTING_CTEXT'], - ['\x07@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], - ['test@\x07.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], - ['"\x07"@iana.org', 'ISEMAIL_DEPREC_QTEXT'], - ['"\\\x07"@iana.org', 'ISEMAIL_DEPREC_QP'], - ['(\x07)test@iana.org', 'ISEMAIL_DEPREC_CTEXT'], - ['\r\ntest@iana.org', 'ISEMAIL_ERR_FWS_CRLF_END'], - ['\r\n \r\ntest@iana.org', 
'ISEMAIL_ERR_FWS_CRLF_END'], - [' \r\ntest@iana.org', 'ISEMAIL_ERR_FWS_CRLF_END'], - [' \r\n test@iana.org', 'ISEMAIL_CFWS_FWS'], - [' \r\n \r\ntest@iana.org', 'ISEMAIL_ERR_FWS_CRLF_END'], - [' \r\n\r\ntest@iana.org', 'ISEMAIL_ERR_FWS_CRLF_X2'], - [' \r\n\r\n test@iana.org', 'ISEMAIL_ERR_FWS_CRLF_X2'], - ['test@iana.org\r\n ', 'ISEMAIL_CFWS_FWS'], - ['test@iana.org\r\n \r\n ', 'ISEMAIL_DEPREC_FWS'], - ['test@iana.org\r\n', 'ISEMAIL_ERR_FWS_CRLF_END'], - ['test@iana.org\r\n \r\n', 'ISEMAIL_ERR_FWS_CRLF_END'], - ['test@iana.org \r\n', 'ISEMAIL_ERR_FWS_CRLF_END'], - ['test@iana.org \r\n ', 'ISEMAIL_CFWS_FWS'], - ['test@iana.org \r\n \r\n', 'ISEMAIL_ERR_FWS_CRLF_END'], - ['test@iana.org \r\n\r\n', 'ISEMAIL_ERR_FWS_CRLF_X2'], - ['test@iana.org \r\n\r\n ', 'ISEMAIL_ERR_FWS_CRLF_X2'], - [' test@iana.org', 'ISEMAIL_CFWS_FWS'], - ['test@iana.org ', 'ISEMAIL_CFWS_FWS'], - ['test@[IPv6:1::2:]', 'ISEMAIL_RFC5322_IPV6_COLONEND'], - ['"test\\©"@iana.org', 'ISEMAIL_ERR_EXPECTING_QPAIR'], - ['test@iana/icann.org', 'ISEMAIL_RFC5322_DOMAIN'], - ['test.(comment)test@iana.org', 'ISEMAIL_DEPREC_COMMENT'] - ] -) -def test_pyisemail_tests(email_input, status): - if status == "ISEMAIL_VALID": - # All standard email address forms should not raise an exception. - validate_email(email_input, test_environment=True) - elif "_ERR_" in status or "_TOOLONG" in status \ - or "_CFWS_FWS" in status or "_CFWS_COMMENT" in status \ - or "_IPV6" in status or status == "ISEMAIL_RFC5322_DOMAIN": - # Invalid syntax, extranous whitespace, and "(comments)" should be rejected. - # The _IPV6_ diagnoses appear to represent syntactically invalid domain literals. - # The ISEMAIL_RFC5322_DOMAIN diagnosis appears to be a syntactically invalid domain. 
- with pytest.raises(EmailSyntaxError): - validate_email(email_input, test_environment=True) - elif "_DEPREC_" in status \ - or "RFC5321_QUOTEDSTRING" in status \ - or "DOMAINLITERAL" in status or "_DOMLIT_" in status or "_ADDRESSLITERAL" in status: - # Quoted strings in the local part, domain literals (IP addresses in brackets), - # and other deprecated syntax are valid email addresses and are accepted by pyIsEmail, - # but we reject them. - with pytest.raises(EmailSyntaxError): - validate_email(email_input, test_environment=True) - else: - raise ValueError("status {} is not recognized".format(status)) - - def test_dict_accessor(): input_email = "testaddr@example.tld" valid_email = validate_email(input_email, check_deliverability=False) @@ -528,37 +10,6 @@ def test_dict_accessor(): assert valid_email.as_dict()["original_email"] == input_email -def test_deliverability_found(): - response = validate_email_deliverability('gmail.com', 'gmail.com') - assert response.keys() == {'mx', 'mx_fallback_type'} - assert response['mx_fallback_type'] is None - assert len(response['mx']) > 1 - assert len(response['mx'][0]) == 2 - assert isinstance(response['mx'][0][0], int) - assert response['mx'][0][1].endswith('.com') - - -def test_deliverability_fails(): - # No MX record. - domain = 'xkxufoekjvjfjeodlfmdfjcu.com' - with pytest.raises(EmailUndeliverableError, match='The domain name {} does not exist'.format(domain)): - validate_email_deliverability(domain, domain) - - # Null MX record. 
- domain = 'example.com' - with pytest.raises(EmailUndeliverableError, match='The domain name {} does not accept email'.format(domain)): - validate_email_deliverability(domain, domain) - - -def test_deliverability_dns_timeout(): - validate_email_deliverability.TEST_CHECK_TIMEOUT = True - response = validate_email_deliverability('gmail.com', 'gmail.com') - assert "mx" not in response - assert response.get("unknown-deliverability") == "timeout" - validate_email('test@gmail.com') - del validate_email_deliverability.TEST_CHECK_TIMEOUT - - def test_main_single_good_input(monkeypatch, capsys): import json test_email = "google@google.com" @@ -590,36 +41,3 @@ def test_main_multi_input(monkeypatch, capsys): assert test_cases[1] not in stdout assert test_cases[2] in stdout assert test_cases[3] in stdout - - -def test_validate_email__with_caching_resolver(): - # unittest.mock.patch("dns.resolver.LRUCache.get") doesn't - # work --- it causes get to always return an empty list. - # So we'll mock our own way. - class MockedCache: - get_called = False - put_called = False - - def get(self, key): - self.get_called = True - return None - - def put(self, key, value): - self.put_called = True - - # Test with caching_resolver helper method. - mocked_cache = MockedCache() - dns_resolver = caching_resolver(cache=mocked_cache) - validate_email("test@gmail.com", dns_resolver=dns_resolver) - assert mocked_cache.put_called - validate_email("test@gmail.com", dns_resolver=dns_resolver) - assert mocked_cache.get_called - - # Test with dns.resolver.Resolver instance. 
- dns_resolver = dns.resolver.Resolver() - dns_resolver.lifetime = 10 - dns_resolver.cache = MockedCache() - validate_email("test@gmail.com", dns_resolver=dns_resolver) - assert mocked_cache.put_called - validate_email("test@gmail.com", dns_resolver=dns_resolver) - assert mocked_cache.get_called diff --git a/tests/test_syntax.py b/tests/test_syntax.py new file mode 100644 index 0000000..41042d5 --- /dev/null +++ b/tests/test_syntax.py @@ -0,0 +1,500 @@ +import pytest + +from email_validator import EmailSyntaxError, \ + validate_email, \ + ValidatedEmail + + +@pytest.mark.parametrize( + 'email_input,output', + [ + ( + 'Abc@example.tld', + ValidatedEmail( + local_part='Abc', + ascii_local_part='Abc', + smtputf8=False, + ascii_domain='example.tld', + domain='example.tld', + email='Abc@example.tld', + ascii_email='Abc@example.tld', + ), + ), + ( + 'Abc.123@test-example.com', + ValidatedEmail( + local_part='Abc.123', + ascii_local_part='Abc.123', + smtputf8=False, + ascii_domain='test-example.com', + domain='test-example.com', + email='Abc.123@test-example.com', + ascii_email='Abc.123@test-example.com', + ), + ), + ( + 'user+mailbox/department=shipping@example.tld', + ValidatedEmail( + local_part='user+mailbox/department=shipping', + ascii_local_part='user+mailbox/department=shipping', + smtputf8=False, + ascii_domain='example.tld', + domain='example.tld', + email='user+mailbox/department=shipping@example.tld', + ascii_email='user+mailbox/department=shipping@example.tld', + ), + ), + ( + "!#$%&'*+-/=?^_`.{|}~@example.tld", + ValidatedEmail( + local_part="!#$%&'*+-/=?^_`.{|}~", + ascii_local_part="!#$%&'*+-/=?^_`.{|}~", + smtputf8=False, + ascii_domain='example.tld', + domain='example.tld', + email="!#$%&'*+-/=?^_`.{|}~@example.tld", + ascii_email="!#$%&'*+-/=?^_`.{|}~@example.tld", + ), + ), + ( + '伊昭傑@郵件.商務', + ValidatedEmail( + local_part='伊昭傑', + smtputf8=True, + ascii_domain='xn--5nqv22n.xn--lhr59c', + domain='郵件.商務', + email='伊昭傑@郵件.商務', + ), + ), + ( + 
'राम@मोहन.ईन्फो', + ValidatedEmail( + local_part='राम', + smtputf8=True, + ascii_domain='xn--l2bl7a9d.xn--o1b8dj2ki', + domain='मोहन.ईन्फो', + email='राम@मोहन.ईन्फो', + ), + ), + ( + 'юзер@екзампл.ком', + ValidatedEmail( + local_part='юзер', + smtputf8=True, + ascii_domain='xn--80ajglhfv.xn--j1aef', + domain='екзампл.ком', + email='юзер@екзампл.ком', + ), + ), + ( + 'θσερ@εχαμπλε.ψομ', + ValidatedEmail( + local_part='θσερ', + smtputf8=True, + ascii_domain='xn--mxahbxey0c.xn--xxaf0a', + domain='εχαμπλε.ψομ', + email='θσερ@εχαμπλε.ψομ', + ), + ), + ( + '葉士豪@臺網中心.tw', + ValidatedEmail( + local_part='葉士豪', + smtputf8=True, + ascii_domain='xn--fiqq24b10vi0d.tw', + domain='臺網中心.tw', + email='葉士豪@臺網中心.tw', + ), + ), + ( + 'jeff@臺網中心.tw', + ValidatedEmail( + local_part='jeff', + ascii_local_part='jeff', + smtputf8=False, + ascii_domain='xn--fiqq24b10vi0d.tw', + domain='臺網中心.tw', + email='jeff@臺網中心.tw', + ascii_email='jeff@xn--fiqq24b10vi0d.tw', + ), + ), + ( + '葉士豪@臺網中心.台灣', + ValidatedEmail( + local_part='葉士豪', + smtputf8=True, + ascii_domain='xn--fiqq24b10vi0d.xn--kpry57d', + domain='臺網中心.台灣', + email='葉士豪@臺網中心.台灣', + ), + ), + ( + 'jeff葉@臺網中心.tw', + ValidatedEmail( + local_part='jeff葉', + smtputf8=True, + ascii_domain='xn--fiqq24b10vi0d.tw', + domain='臺網中心.tw', + email='jeff葉@臺網中心.tw', + ), + ), + ( + 'ñoñó@example.tld', + ValidatedEmail( + local_part='ñoñó', + smtputf8=True, + ascii_domain='example.tld', + domain='example.tld', + email='ñoñó@example.tld', + ), + ), + ( + '我買@example.tld', + ValidatedEmail( + local_part='我買', + smtputf8=True, + ascii_domain='example.tld', + domain='example.tld', + email='我買@example.tld', + ), + ), + ( + '甲斐黒川日本@example.tld', + ValidatedEmail( + local_part='甲斐黒川日本', + smtputf8=True, + ascii_domain='example.tld', + domain='example.tld', + email='甲斐黒川日本@example.tld', + ), + ), + ( + 'чебурашкаящик-с-апельсинами.рф@example.tld', + ValidatedEmail( + local_part='чебурашкаящик-с-апельсинами.рф', + smtputf8=True, + ascii_domain='example.tld', + 
domain='example.tld', + email='чебурашкаящик-с-апельсинами.рф@example.tld', + ), + ), + ( + 'उदाहरण.परीक्ष@domain.with.idn.tld', + ValidatedEmail( + local_part='उदाहरण.परीक्ष', + smtputf8=True, + ascii_domain='domain.with.idn.tld', + domain='domain.with.idn.tld', + email='उदाहरण.परीक्ष@domain.with.idn.tld', + ), + ), + ( + 'ιωάννης@εεττ.gr', + ValidatedEmail( + local_part='ιωάννης', + smtputf8=True, + ascii_domain='xn--qxaa9ba.gr', + domain='εεττ.gr', + email='ιωάννης@εεττ.gr', + ), + ), + ], +) +def test_email_valid(email_input, output): + # print(f'({email_input!r}, {validate_email(email_input, check_deliverability=False)!r}),') + assert validate_email(email_input, check_deliverability=False) == output + + +@pytest.mark.parametrize( + 'email_input,error_msg', + [ + ('my@localhost', 'The part after the @-sign is not valid. It should have a period.'), + ('my@.leadingdot.com', 'An email address cannot have a period immediately after the @-sign.'), + ('my@..leadingfwdot.com', 'An email address cannot have a period immediately after the @-sign.'), + ('my@..twodots.com', 'An email address cannot have a period immediately after the @-sign.'), + ('my@twodots..com', 'An email address cannot have two periods in a row.'), + ('my@baddash.-.com', + 'The part after the @-sign contains invalid characters (Label must not start or end with a hyphen).'), + ('my@baddash.-a.com', + 'The part after the @-sign contains invalid characters (Label must not start or end with a hyphen).'), + ('my@baddash.b-.com', + 'The part after the @-sign contains invalid characters (Label must not start or end with a hyphen).'), + ('my@example.com\n', + 'The part after the @-sign contains invalid characters (Codepoint U+000A at position 4 of ' + '\'com\\n\' not allowed).'), + ('my@example\n.com', + 'The part after the @-sign contains invalid characters (Codepoint U+000A at position 8 of ' + '\'example\\n\' not allowed).'), + ('.leadingdot@domain.com', 'The email address contains invalid characters 
before the @-sign: FULL STOP.'), + ('..twodots@domain.com', 'The email address contains invalid characters before the @-sign: FULL STOP.'), + ('twodots..here@domain.com', 'The email address contains invalid characters before the @-sign: FULL STOP.'), + ('me@⒈wouldbeinvalid.com', + "The part after the @-sign contains invalid characters (Codepoint U+2488 not allowed " + "at position 1 in '⒈wouldbeinvalid.com')."), + ('@example.com', 'There must be something before the @-sign.'), + ('\nmy@example.com', 'The email address contains invalid characters before the @-sign: \'\\n\'.'), + ('m\ny@example.com', 'The email address contains invalid characters before the @-sign: \'\\n\'.'), + ('my\n@example.com', 'The email address contains invalid characters before the @-sign: \'\\n\'.'), + ('11111111112222222222333333333344444444445555555555666666666677777@example.com', 'The email address is too long before the @-sign (1 character too many).'), + ('111111111122222222223333333333444444444455555555556666666666777777@example.com', 'The email address is too long before the @-sign (2 characters too many).'), + ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444444444455555555556.com', 'The email address is too long (4 characters too many).'), + ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555566.com', 'The email address is too long after the @-sign (1 character too many).'), + 
('my.long.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444.info', 'The email address is too long (2 characters too many).'), + ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333.info', 'The email address is too long (when converted to IDNA ASCII).'), + ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (at least 1 character too many).'), + ('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444.info', 'The email address is too long (when encoded in bytes).'), + ('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (at least 1 character too many).'), + ], +) +def test_email_invalid_syntax(email_input, error_msg): + # Since these all have syntax errors, deliverability + # checks do not arise. 
+ with pytest.raises(EmailSyntaxError) as exc_info: + validate_email(email_input) + # print(f'({email_input!r}, {str(exc_info.value)!r}),') + assert str(exc_info.value) == error_msg + + +@pytest.mark.parametrize( + 'email_input', + [ + ('me@anything.arpa'), + ('me@valid.invalid'), + ('me@link.local'), + ('me@host.localhost'), + ('me@onion.onion.onion'), + ('me@test.test.test'), + ], +) +def test_email_invalid_reserved_domain(email_input): + # Since these all fail deliverabiltiy from a static list, + # DNS deliverability checks do not arise. + with pytest.raises(EmailSyntaxError) as exc_info: + validate_email(email_input) + # print(f'({email_input!r}, {str(exc_info.value)!r}),') + assert "is a special-use or reserved name" in str(exc_info.value) + + +@pytest.mark.parametrize( + 'email_input', + [ + ('white space@test'), + ('\n@test'), + ('\u2005@test'), # four-per-em space (Zs) + ('\u009C@test'), # string terminator (Cc) + ('\u200B@test'), # zero-width space (Cf) + ('\u202Dforward-\u202Ereversed@test'), # BIDI (Cf) + ('\uD800@test'), # surrogate (Cs) + ('\uE000@test'), # private use (Co) + ('\uFDEF@test'), # unassigned (Cn) + ], +) +def test_email_unsafe_character(email_input): + # Check for various unsafe characters: + with pytest.raises(EmailSyntaxError) as exc_info: + validate_email(email_input, test_environment=True) + assert "invalid character" in str(exc_info.value) + + +def test_email_test_domain_name_in_test_environment(): + validate_email("anything@test", test_environment=True) + validate_email("anything@mycompany.test", test_environment=True) + + +# This is the pyIsEmail (https://github.com/michaelherold/pyIsEmail) test suite. 
+# +# The test data was extracted by: +# +# $ wget https://raw.githubusercontent.com/michaelherold/pyIsEmail/master/tests/data/tests.xml +# $ xmllint --xpath '/tests/test/address/text()' tests.xml > t1 +# $ xmllint --xpath "/tests/test[not(address='')]/diagnosis/text()" tests.xml > t2 +# +# tests = [] +# def fixup_char(c): +# if ord(c) >= 0x2400 and ord(c) <= 0x2432: +# c = chr(ord(c)-0x2400) +# return c +# for email, diagnosis in zip(open("t1"), open("t2")): +# email = email[:-1] # strip trailing \n but not more because trailing whitespace is significant +# email = "".join(fixup_char(c) for c in email).replace("&", "&") +# tests.append([email, diagnosis.strip()]) +# print(repr(tests).replace("'], ['", "'],\n['")) +@pytest.mark.parametrize( + ('email_input', 'status'), + [ + ['test', 'ISEMAIL_ERR_NODOMAIN'], + ['@', 'ISEMAIL_ERR_NOLOCALPART'], + ['test@', 'ISEMAIL_ERR_NODOMAIN'], + # ['test@io', 'ISEMAIL_VALID'], # we reject domains without a dot, knowing they are not deliverable + ['@io', 'ISEMAIL_ERR_NOLOCALPART'], + ['@iana.org', 'ISEMAIL_ERR_NOLOCALPART'], + ['test@iana.org', 'ISEMAIL_VALID'], + ['test@nominet.org.uk', 'ISEMAIL_VALID'], + ['test@about.museum', 'ISEMAIL_VALID'], + ['a@iana.org', 'ISEMAIL_VALID'], + ['test.test@iana.org', 'ISEMAIL_VALID'], + ['.test@iana.org', 'ISEMAIL_ERR_DOT_START'], + ['test.@iana.org', 'ISEMAIL_ERR_DOT_END'], + ['test..iana.org', 'ISEMAIL_ERR_CONSECUTIVEDOTS'], + ['test_exa-mple.com', 'ISEMAIL_ERR_NODOMAIN'], + ['!#$%&`*+/=?^`{|}~@iana.org', 'ISEMAIL_VALID'], + ['test\\@test@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], + ['123@iana.org', 'ISEMAIL_VALID'], + ['test@123.com', 'ISEMAIL_VALID'], + ['abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghiklm@iana.org', 'ISEMAIL_VALID'], + ['abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghiklmn@iana.org', 'ISEMAIL_RFC5322_LOCAL_TOOLONG'], + ['test@abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghiklm.com', 'ISEMAIL_RFC5322_LABEL_TOOLONG'], + 
['test@mason-dixon.com', 'ISEMAIL_VALID'], + ['test@-iana.org', 'ISEMAIL_ERR_DOMAINHYPHENSTART'], + ['test@iana-.com', 'ISEMAIL_ERR_DOMAINHYPHENEND'], + ['test@g--a.com', 'ISEMAIL_VALID'], + ['test@.iana.org', 'ISEMAIL_ERR_DOT_START'], + ['test@iana.org.', 'ISEMAIL_ERR_DOT_END'], + ['test@iana..com', 'ISEMAIL_ERR_CONSECUTIVEDOTS'], + ['abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghiklm@abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghij', 'ISEMAIL_RFC5322_TOOLONG'], + ['a@abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefg.hij', 'ISEMAIL_RFC5322_TOOLONG'], + ['a@abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefg.hijk', 'ISEMAIL_RFC5322_DOMAIN_TOOLONG'], + ['"test"@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], + ['""@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], + ['"""@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], + ['"\\a"@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], + ['"\\""@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], + ['"\\"@iana.org', 'ISEMAIL_ERR_UNCLOSEDQUOTEDSTR'], + ['"\\\\"@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], + ['test"@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], + ['"test@iana.org', 'ISEMAIL_ERR_UNCLOSEDQUOTEDSTR'], + ['"test"test@iana.org', 'ISEMAIL_ERR_ATEXT_AFTER_QS'], + ['test"text"@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], + ['"test""test"@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], + ['"test"."test"@iana.org', 'ISEMAIL_DEPREC_LOCALPART'], + ['"test\\ test"@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], + 
['"test".test@iana.org', 'ISEMAIL_DEPREC_LOCALPART'], + ['"test\x00"@iana.org', 'ISEMAIL_ERR_EXPECTING_QTEXT'], + ['"test\\\x00"@iana.org', 'ISEMAIL_DEPREC_QP'], + ['"abcdefghijklmnopqrstuvwxyz abcdefghijklmnopqrstuvwxyz abcdefghj"@iana.org', 'ISEMAIL_RFC5322_LOCAL_TOOLONG'], + ['"abcdefghijklmnopqrstuvwxyz abcdefghijklmnopqrstuvwxyz abcdefg\\h"@iana.org', 'ISEMAIL_RFC5322_LOCAL_TOOLONG'], + ['test@[255.255.255.255]', 'ISEMAIL_RFC5321_ADDRESSLITERAL'], + ['test@a[255.255.255.255]', 'ISEMAIL_ERR_EXPECTING_ATEXT'], + ['test@[255.255.255]', 'ISEMAIL_RFC5322_DOMAINLITERAL'], + ['test@[255.255.255.255.255]', 'ISEMAIL_RFC5322_DOMAINLITERAL'], + ['test@[255.255.255.256]', 'ISEMAIL_RFC5322_DOMAINLITERAL'], + ['test@[1111:2222:3333:4444:5555:6666:7777:8888]', 'ISEMAIL_RFC5322_DOMAINLITERAL'], + ['test@[IPv6:1111:2222:3333:4444:5555:6666:7777]', 'ISEMAIL_RFC5322_IPV6_GRPCOUNT'], + ['test@[IPv6:1111:2222:3333:4444:5555:6666:7777:8888]', 'ISEMAIL_RFC5321_ADDRESSLITERAL'], + ['test@[IPv6:1111:2222:3333:4444:5555:6666:7777:8888:9999]', 'ISEMAIL_RFC5322_IPV6_GRPCOUNT'], + ['test@[IPv6:1111:2222:3333:4444:5555:6666:7777:888G]', 'ISEMAIL_RFC5322_IPV6_BADCHAR'], + ['test@[IPv6:1111:2222:3333:4444:5555:6666::8888]', 'ISEMAIL_RFC5321_IPV6DEPRECATED'], + ['test@[IPv6:1111:2222:3333:4444:5555::8888]', 'ISEMAIL_RFC5321_ADDRESSLITERAL'], + ['test@[IPv6:1111:2222:3333:4444:5555:6666::7777:8888]', 'ISEMAIL_RFC5322_IPV6_MAXGRPS'], + ['test@[IPv6::3333:4444:5555:6666:7777:8888]', 'ISEMAIL_RFC5322_IPV6_COLONSTRT'], + ['test@[IPv6:::3333:4444:5555:6666:7777:8888]', 'ISEMAIL_RFC5321_ADDRESSLITERAL'], + ['test@[IPv6:1111::4444:5555::8888]', 'ISEMAIL_RFC5322_IPV6_2X2XCOLON'], + ['test@[IPv6:::]', 'ISEMAIL_RFC5321_ADDRESSLITERAL'], + ['test@[IPv6:1111:2222:3333:4444:5555:255.255.255.255]', 'ISEMAIL_RFC5322_IPV6_GRPCOUNT'], + ['test@[IPv6:1111:2222:3333:4444:5555:6666:255.255.255.255]', 'ISEMAIL_RFC5321_ADDRESSLITERAL'], + ['test@[IPv6:1111:2222:3333:4444:5555:6666:7777:255.255.255.255]', 
'ISEMAIL_RFC5322_IPV6_GRPCOUNT'], + ['test@[IPv6:1111:2222:3333:4444::255.255.255.255]', 'ISEMAIL_RFC5321_ADDRESSLITERAL'], + ['test@[IPv6:1111:2222:3333:4444:5555:6666::255.255.255.255]', 'ISEMAIL_RFC5322_IPV6_MAXGRPS'], + ['test@[IPv6:1111:2222:3333:4444:::255.255.255.255]', 'ISEMAIL_RFC5322_IPV6_2X2XCOLON'], + ['test@[IPv6::255.255.255.255]', 'ISEMAIL_RFC5322_IPV6_COLONSTRT'], + [' test @iana.org', 'ISEMAIL_DEPREC_CFWS_NEAR_AT'], + ['test@ iana .com', 'ISEMAIL_DEPREC_CFWS_NEAR_AT'], + ['test . test@iana.org', 'ISEMAIL_DEPREC_FWS'], + ['\r\n test@iana.org', 'ISEMAIL_CFWS_FWS'], + ['\r\n \r\n test@iana.org', 'ISEMAIL_DEPREC_FWS'], + ['(comment)test@iana.org', 'ISEMAIL_CFWS_COMMENT'], + ['((comment)test@iana.org', 'ISEMAIL_ERR_UNCLOSEDCOMMENT'], + ['(comment(comment))test@iana.org', 'ISEMAIL_CFWS_COMMENT'], + ['test@(comment)iana.org', 'ISEMAIL_DEPREC_CFWS_NEAR_AT'], + ['test(comment)test@iana.org', 'ISEMAIL_ERR_ATEXT_AFTER_CFWS'], + ['test@(comment)[255.255.255.255]', 'ISEMAIL_DEPREC_CFWS_NEAR_AT'], + ['(comment)abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghiklm@iana.org', 'ISEMAIL_CFWS_COMMENT'], + ['test@(comment)abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.com', 'ISEMAIL_DEPREC_CFWS_NEAR_AT'], + ['(comment)test@abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghik.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghik.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk.abcdefghijklmnopqrstuvwxyzabcdefghijk.abcdefghijklmnopqrstu', 'ISEMAIL_CFWS_COMMENT'], + ['test@iana.org\n', 'ISEMAIL_ERR_EXPECTING_ATEXT'], + ['test@xn--hxajbheg2az3al.xn--jxalpdlp', 'ISEMAIL_VALID'], + ['xn--test@iana.org', 'ISEMAIL_VALID'], + ['test@iana.org-', 'ISEMAIL_ERR_DOMAINHYPHENEND'], + ['"test@iana.org', 'ISEMAIL_ERR_UNCLOSEDQUOTEDSTR'], + ['(test@iana.org', 'ISEMAIL_ERR_UNCLOSEDCOMMENT'], + ['test@(iana.org', 'ISEMAIL_ERR_UNCLOSEDCOMMENT'], + ['test@[1.2.3.4', 'ISEMAIL_ERR_UNCLOSEDDOMLIT'], + ['"test\\"@iana.org', 
'ISEMAIL_ERR_UNCLOSEDQUOTEDSTR'], + ['(comment\\)test@iana.org', 'ISEMAIL_ERR_UNCLOSEDCOMMENT'], + ['test@iana.org(comment\\)', 'ISEMAIL_ERR_UNCLOSEDCOMMENT'], + ['test@iana.org(comment\\', 'ISEMAIL_ERR_BACKSLASHEND'], + ['test@[RFC-5322-domain-literal]', 'ISEMAIL_RFC5322_DOMAINLITERAL'], + ['test@[RFC-5322]-domain-literal]', 'ISEMAIL_ERR_ATEXT_AFTER_DOMLIT'], + ['test@[RFC-5322-[domain-literal]', 'ISEMAIL_ERR_EXPECTING_DTEXT'], + ['test@[RFC-5322-\\\x07-domain-literal]', 'ISEMAIL_RFC5322_DOMLIT_OBSDTEXT'], + ['test@[RFC-5322-\\\t-domain-literal]', 'ISEMAIL_RFC5322_DOMLIT_OBSDTEXT'], + ['test@[RFC-5322-\\]-domain-literal]', 'ISEMAIL_RFC5322_DOMLIT_OBSDTEXT'], + ['test@[RFC-5322-domain-literal\\]', 'ISEMAIL_ERR_UNCLOSEDDOMLIT'], + ['test@[RFC-5322-domain-literal\\', 'ISEMAIL_ERR_BACKSLASHEND'], + ['test@[RFC 5322 domain literal]', 'ISEMAIL_RFC5322_DOMAINLITERAL'], + ['test@[RFC-5322-domain-literal] (comment)', 'ISEMAIL_RFC5322_DOMAINLITERAL'], + ['\x7f@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], + ['test@\x7f.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], + ['"\x7f"@iana.org', 'ISEMAIL_DEPREC_QTEXT'], + ['"\\\x7f"@iana.org', 'ISEMAIL_DEPREC_QP'], + ['(\x7f)test@iana.org', 'ISEMAIL_DEPREC_CTEXT'], + ['test@iana.org\r', 'ISEMAIL_ERR_CR_NO_LF'], + ['\rtest@iana.org', 'ISEMAIL_ERR_CR_NO_LF'], + ['"\rtest"@iana.org', 'ISEMAIL_ERR_CR_NO_LF'], + ['(\r)test@iana.org', 'ISEMAIL_ERR_CR_NO_LF'], + ['test@iana.org(\r)', 'ISEMAIL_ERR_CR_NO_LF'], + ['\ntest@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], + ['"\n"@iana.org', 'ISEMAIL_ERR_EXPECTING_QTEXT'], + ['"\\\n"@iana.org', 'ISEMAIL_DEPREC_QP'], + ['(\n)test@iana.org', 'ISEMAIL_ERR_EXPECTING_CTEXT'], + ['\x07@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], + ['test@\x07.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], + ['"\x07"@iana.org', 'ISEMAIL_DEPREC_QTEXT'], + ['"\\\x07"@iana.org', 'ISEMAIL_DEPREC_QP'], + ['(\x07)test@iana.org', 'ISEMAIL_DEPREC_CTEXT'], + ['\r\ntest@iana.org', 'ISEMAIL_ERR_FWS_CRLF_END'], + ['\r\n \r\ntest@iana.org', 
'ISEMAIL_ERR_FWS_CRLF_END'], + [' \r\ntest@iana.org', 'ISEMAIL_ERR_FWS_CRLF_END'], + [' \r\n test@iana.org', 'ISEMAIL_CFWS_FWS'], + [' \r\n \r\ntest@iana.org', 'ISEMAIL_ERR_FWS_CRLF_END'], + [' \r\n\r\ntest@iana.org', 'ISEMAIL_ERR_FWS_CRLF_X2'], + [' \r\n\r\n test@iana.org', 'ISEMAIL_ERR_FWS_CRLF_X2'], + ['test@iana.org\r\n ', 'ISEMAIL_CFWS_FWS'], + ['test@iana.org\r\n \r\n ', 'ISEMAIL_DEPREC_FWS'], + ['test@iana.org\r\n', 'ISEMAIL_ERR_FWS_CRLF_END'], + ['test@iana.org\r\n \r\n', 'ISEMAIL_ERR_FWS_CRLF_END'], + ['test@iana.org \r\n', 'ISEMAIL_ERR_FWS_CRLF_END'], + ['test@iana.org \r\n ', 'ISEMAIL_CFWS_FWS'], + ['test@iana.org \r\n \r\n', 'ISEMAIL_ERR_FWS_CRLF_END'], + ['test@iana.org \r\n\r\n', 'ISEMAIL_ERR_FWS_CRLF_X2'], + ['test@iana.org \r\n\r\n ', 'ISEMAIL_ERR_FWS_CRLF_X2'], + [' test@iana.org', 'ISEMAIL_CFWS_FWS'], + ['test@iana.org ', 'ISEMAIL_CFWS_FWS'], + ['test@[IPv6:1::2:]', 'ISEMAIL_RFC5322_IPV6_COLONEND'], + ['"test\\©"@iana.org', 'ISEMAIL_ERR_EXPECTING_QPAIR'], + ['test@iana/icann.org', 'ISEMAIL_RFC5322_DOMAIN'], + ['test.(comment)test@iana.org', 'ISEMAIL_DEPREC_COMMENT'] + ] +) +def test_pyisemail_tests(email_input, status): + if status == "ISEMAIL_VALID": + # All standard email address forms should not raise an exception. + validate_email(email_input, test_environment=True) + elif "_ERR_" in status or "_TOOLONG" in status \ + or "_CFWS_FWS" in status or "_CFWS_COMMENT" in status \ + or "_IPV6" in status or status == "ISEMAIL_RFC5322_DOMAIN": + # Invalid syntax, extranous whitespace, and "(comments)" should be rejected. + # The _IPV6_ diagnoses appear to represent syntactically invalid domain literals. + # The ISEMAIL_RFC5322_DOMAIN diagnosis appears to be a syntactically invalid domain. 
+ with pytest.raises(EmailSyntaxError): + validate_email(email_input, test_environment=True) + elif "_DEPREC_" in status \ + or "RFC5321_QUOTEDSTRING" in status \ + or "DOMAINLITERAL" in status or "_DOMLIT_" in status or "_ADDRESSLITERAL" in status: + # Quoted strings in the local part, domain literals (IP addresses in brackets), + # and other deprecated syntax are valid email addresses and are accepted by pyIsEmail, + # but we reject them. + with pytest.raises(EmailSyntaxError): + validate_email(email_input, test_environment=True) + else: + raise ValueError("status {} is not recognized".format(status)) From d40f70881e621cee88e1acd1272db6a215656cf6 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Fri, 27 Jan 2023 11:42:35 -0500 Subject: [PATCH 066/174] In tests replace live DNS queries with mocked DNS answers that have been captured to a JSON file --- README.md | 2 + email_validator/__main__.py | 8 +- email_validator/deliverability.py | 6 -- tests/mocked-dns-answers.json | 100 +++++++++++++++++++++++++ tests/mocked_dns_response.py | 120 ++++++++++++++++++++++++++++++ tests/test_deliverability.py | 53 +++---------- tests/test_main.py | 12 ++- 7 files changed, 245 insertions(+), 56 deletions(-) create mode 100644 tests/mocked-dns-answers.json create mode 100644 tests/mocked_dns_response.py diff --git a/README.md b/README.md index 55017ad..a483bc8 100644 --- a/README.md +++ b/README.md @@ -416,6 +416,8 @@ pip install -r test_requirements.txt make test ``` +Tests run with mocked DNS responses. When adding or changing tests, temporarily turn on the `BUILD_MOCKED_DNS_RESPONSE_DATA` flag in `tests/mocked_dns_responses.py` to re-build the database of mocked responses from live queries. 
+ For Project Maintainers ----------------------- diff --git a/email_validator/__main__.py b/email_validator/__main__.py index a2e69fe..4684f6a 100644 --- a/email_validator/__main__.py +++ b/email_validator/__main__.py @@ -17,10 +17,12 @@ from .exceptions_types import EmailNotValidError -def main(): +def main(dns_resolver=None): + # The dns_resolver argument is for tests. + if len(sys.argv) == 1: # Validate the email addresses pased line-by-line on STDIN. - dns_resolver = caching_resolver() + dns_resolver = dns_resolver or caching_resolver() for line in sys.stdin: email = line.strip() try: @@ -31,7 +33,7 @@ def main(): # Validate the email address passed on the command line. email = sys.argv[1] try: - result = validate_email(email) + result = validate_email(email, dns_resolver=dns_resolver) print(json.dumps(result.as_dict(), indent=2, sort_keys=True, ensure_ascii=False)) except EmailNotValidError as e: print(e) diff --git a/email_validator/deliverability.py b/email_validator/deliverability.py index bd1a6c4..a7c3752 100644 --- a/email_validator/deliverability.py +++ b/email_validator/deliverability.py @@ -33,12 +33,6 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve deliverability_info = {} try: - # We need a way to check how timeouts are handled in the tests. So we - # have a secret variable that if set makes this method always test the - # handling of a timeout. - if getattr(validate_email_deliverability, 'TEST_CHECK_TIMEOUT', False): - raise dns.exception.Timeout() - try: # Try resolving for MX records. 
response = dns_resolver.resolve(domain, "MX") diff --git a/tests/mocked-dns-answers.json b/tests/mocked-dns-answers.json new file mode 100644 index 0000000..bc5465b --- /dev/null +++ b/tests/mocked-dns-answers.json @@ -0,0 +1,100 @@ +[ + { + "query": { + "name": "gmail.com", + "type": "MX", + "class": "IN" + }, + "answer": [ + "10 alt1.gmail-smtp-in.l.google.com.", + "30 alt3.gmail-smtp-in.l.google.com.", + "5 gmail-smtp-in.l.google.com.", + "20 alt2.gmail-smtp-in.l.google.com.", + "40 alt4.gmail-smtp-in.l.google.com." + ] + }, + { + "query": { + "name": "xkxufoekjvjfjeodlfmdfjcu.com", + "type": "ANY", + "class": "IN" + }, + "answer": [] + }, + { + "query": { + "name": "xkxufoekjvjfjeodlfmdfjcu.com", + "type": "AAAA", + "class": "IN" + }, + "answer": [] + }, + { + "query": { + "name": "example.com", + "type": "MX", + "class": "IN" + }, + "answer": [ + "0 ." + ] + }, + { + "query": { + "name": "mail.example", + "type": "MX", + "class": "IN" + }, + "answer": [] + }, + { + "query": { + "name": "mail.example", + "type": "ANY", + "class": "IN" + }, + "answer": [] + }, + { + "query": { + "name": "mail.example", + "type": "AAAA", + "class": "IN" + }, + "answer": [] + }, + { + "query": { + "name": "mail.example.com", + "type": "MX", + "class": "IN" + }, + "answer": [] + }, + { + "query": { + "name": "mail.example.com", + "type": "ANY", + "class": "IN" + }, + "answer": [] + }, + { + "query": { + "name": "mail.example.com", + "type": "AAAA", + "class": "IN" + }, + "answer": [] + }, + { + "query": { + "name": "google.com", + "type": "MX", + "class": "IN" + }, + "answer": [ + "10 smtp.google.com." 
+ ] + } +] \ No newline at end of file diff --git a/tests/mocked_dns_response.py b/tests/mocked_dns_response.py new file mode 100644 index 0000000..2ffabf6 --- /dev/null +++ b/tests/mocked_dns_response.py @@ -0,0 +1,120 @@ +import dns.resolver +import json +import os.path +import pytest + +from email_validator.deliverability import caching_resolver + +# To run deliverability checks without actually making +# DNS queries, we use a caching resolver where the cache +# is pre-loaded with DNS responses. + +# When False, all DNS queries must come from the mocked +# data. When True, tests are run with live DNS queries +# and the DNS responses are saved to a file. +BUILD_MOCKED_DNS_RESPONSE_DATA = False + + +# This class implements the 'get' and 'put' methods +# expected for a dns.resolver.Resolver's cache. +class MockedDnsResponseData: + DATA_PATH = os.path.dirname(__file__) + "/mocked-dns-answers.json" + + @staticmethod + def create_resolver(): + if not hasattr(MockedDnsResponseData, 'INSTANCE'): + # Create a singleton instance of this class and load the saved DNS responses. + # Except when BUILD_MOCKED_DNS_RESPONSE_DATA is true, don't load the data. + singleton = MockedDnsResponseData() + if not BUILD_MOCKED_DNS_RESPONSE_DATA: + singleton.load() + MockedDnsResponseData.INSTANCE = singleton + + # Return a new dns.resolver.Resolver configured for caching + # using the singleton instance. + return caching_resolver(cache=MockedDnsResponseData.INSTANCE) + + def __init__(self): + self.data = {} + + def load(self): + # Loads the saved DNS response data from the JSON file and + # re-structures it into dnspython classes. 
+ class Ans: # mocks the dns.resolver.Answer class + + def __init__(self, rrset): + self.rrset = rrset + + def __iter__(self): + return iter(self.rrset) + + with open(self.DATA_PATH) as f: + data = json.load(f) + for item in data: + key = (dns.name.from_text(item["query"]["name"] + "."), + dns.rdatatype.from_text(item["query"]["type"]), + dns.rdataclass.from_text(item["query"]["class"])) + rdatas = [ + dns.rdata.from_text(rdtype=key[1], rdclass=key[2], tok=rr) + for rr in item["answer"] + ] + if item["answer"]: + self.data[key] = Ans(dns.rdataset.from_rdata_list(0, rdatas=rdatas)) + else: + self.data[key] = None + + def save(self): + # Re-structure as a list with basic data types. + data = [ + { + "query": { + "name": key[0].to_text(omit_final_dot=True), + "type": dns.rdatatype.to_text(key[1]), + "class": dns.rdataclass.to_text(key[2]), + }, + "answer": [ + rr.to_text() + for rr in value + ] + } + for key, value in self.data.items() + ] + with open(self.DATA_PATH, "w") as f: + json.dump(data, f, indent=True) + + def get(self, key): + # Special-case a domain to create a timeout. + if key[0].to_text() == "timeout.com.": + raise dns.exception.Timeout() + + # When building the DNS response database, return + # a cache miss. + if BUILD_MOCKED_DNS_RESPONSE_DATA: + return None + + # Query the data for a matching record. + if key in self.data: + if not self.data[key]: + raise dns.resolver.NoAnswer() + return self.data[key] + + # Query the data for a response to an ANY query. + ANY = dns.rdatatype.from_text("ANY") + if (key[0], ANY, key[2]) in self.data and self.data[(key[0], ANY, key[2])] is None: + raise dns.resolver.NoAnswer() + + raise ValueError("Saved DNS data did not contain query: {}".format(key)) + + def put(self, key, value): + # Build the DNS data by saving the live query response. 
+ if not BUILD_MOCKED_DNS_RESPONSE_DATA: + raise ValueError("Should not get here.") + self.data[key] = value + + +@pytest.fixture(scope="session", autouse=True) +def MockedDnsResponseDataCleanup(request): + def cleanup_func(): + if BUILD_MOCKED_DNS_RESPONSE_DATA: + MockedDnsResponseData.INSTANCE.save() + request.addfinalizer(cleanup_func) diff --git a/tests/test_deliverability.py b/tests/test_deliverability.py index e8a7cce..5cb20e9 100644 --- a/tests/test_deliverability.py +++ b/tests/test_deliverability.py @@ -1,14 +1,17 @@ -import dns.resolver import pytest import re from email_validator import EmailUndeliverableError, \ validate_email -from email_validator.deliverability import caching_resolver, validate_email_deliverability +from email_validator.deliverability import validate_email_deliverability + +from mocked_dns_response import MockedDnsResponseData, MockedDnsResponseDataCleanup # noqa: F401 + +RESOLVER = MockedDnsResponseData.create_resolver() def test_deliverability_found(): - response = validate_email_deliverability('gmail.com', 'gmail.com') + response = validate_email_deliverability('gmail.com', 'gmail.com', dns_resolver=RESOLVER) assert response.keys() == {'mx', 'mx_fallback_type'} assert response['mx_fallback_type'] is None assert len(response['mx']) > 1 @@ -21,12 +24,12 @@ def test_deliverability_fails(): # No MX record. domain = 'xkxufoekjvjfjeodlfmdfjcu.com' with pytest.raises(EmailUndeliverableError, match='The domain name {} does not exist'.format(domain)): - validate_email_deliverability(domain, domain) + validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) # Null MX record. 
domain = 'example.com' with pytest.raises(EmailUndeliverableError, match='The domain name {} does not accept email'.format(domain)): - validate_email_deliverability(domain, domain) + validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) @pytest.mark.parametrize( @@ -41,48 +44,12 @@ def test_email_example_reserved_domain(email_input): # Since these all fail deliverabiltiy from a static list, # DNS deliverability checks do not arise. with pytest.raises(EmailUndeliverableError) as exc_info: - validate_email(email_input) + validate_email(email_input, dns_resolver=RESOLVER) # print(f'({email_input!r}, {str(exc_info.value)!r}),') assert re.match(r"The domain name [a-z\.]+ does not (accept email|exist)\.", str(exc_info.value)) is not None def test_deliverability_dns_timeout(): - validate_email_deliverability.TEST_CHECK_TIMEOUT = True - response = validate_email_deliverability('gmail.com', 'gmail.com') + response = validate_email_deliverability('timeout.com', 'timeout.com', dns_resolver=RESOLVER) assert "mx" not in response assert response.get("unknown-deliverability") == "timeout" - validate_email('test@gmail.com') - del validate_email_deliverability.TEST_CHECK_TIMEOUT - - -def test_validate_email__with_caching_resolver(): - # unittest.mock.patch("dns.resolver.LRUCache.get") doesn't - # work --- it causes get to always return an empty list. - # So we'll mock our own way. - class MockedCache: - get_called = False - put_called = False - - def get(self, key): - self.get_called = True - return None - - def put(self, key, value): - self.put_called = True - - # Test with caching_resolver helper method. - mocked_cache = MockedCache() - dns_resolver = caching_resolver(cache=mocked_cache) - validate_email("test@gmail.com", dns_resolver=dns_resolver) - assert mocked_cache.put_called - validate_email("test@gmail.com", dns_resolver=dns_resolver) - assert mocked_cache.get_called - - # Test with dns.resolver.Resolver instance. 
- dns_resolver = dns.resolver.Resolver() - dns_resolver.lifetime = 10 - dns_resolver.cache = MockedCache() - validate_email("test@gmail.com", dns_resolver=dns_resolver) - assert mocked_cache.put_called - validate_email("test@gmail.com", dns_resolver=dns_resolver) - assert mocked_cache.get_called diff --git a/tests/test_main.py b/tests/test_main.py index e087e2d..6676ff6 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -2,6 +2,10 @@ # Let's test main but rename it to be clear from email_validator.__main__ import main as validator_command_line_tool +from mocked_dns_response import MockedDnsResponseData, MockedDnsResponseDataCleanup # noqa: F401 + +RESOLVER = MockedDnsResponseData.create_resolver() + def test_dict_accessor(): input_email = "testaddr@example.tld" @@ -14,17 +18,17 @@ def test_main_single_good_input(monkeypatch, capsys): import json test_email = "google@google.com" monkeypatch.setattr('sys.argv', ['email_validator', test_email]) - validator_command_line_tool() + validator_command_line_tool(dns_resolver=RESOLVER) stdout, _ = capsys.readouterr() output = json.loads(str(stdout)) assert isinstance(output, dict) - assert validate_email(test_email).original_email == output["original_email"] + assert validate_email(test_email, dns_resolver=RESOLVER).original_email == output["original_email"] def test_main_single_bad_input(monkeypatch, capsys): bad_email = 'test@..com' monkeypatch.setattr('sys.argv', ['email_validator', bad_email]) - validator_command_line_tool() + validator_command_line_tool(dns_resolver=RESOLVER) stdout, _ = capsys.readouterr() assert stdout == 'An email address cannot have a period immediately after the @-sign.\n' @@ -35,7 +39,7 @@ def test_main_multi_input(monkeypatch, capsys): test_input = io.StringIO("\n".join(test_cases)) monkeypatch.setattr('sys.stdin', test_input) monkeypatch.setattr('sys.argv', ['email_validator']) - validator_command_line_tool() + validator_command_line_tool(dns_resolver=RESOLVER) stdout, _ = 
capsys.readouterr() assert test_cases[0] not in stdout assert test_cases[1] not in stdout From 52f2ab7c54aadb114c6b322c042d1d80cafb194e Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 4 Feb 2023 14:21:53 -0500 Subject: [PATCH 067/174] Improve code coverage --- tests/test_deliverability.py | 23 ++++++++++++++++++++++- tests/test_main.py | 15 ++++++++++++++- tests/test_syntax.py | 8 ++++++++ 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/tests/test_deliverability.py b/tests/test_deliverability.py index 5cb20e9..e1b3dbc 100644 --- a/tests/test_deliverability.py +++ b/tests/test_deliverability.py @@ -2,7 +2,7 @@ import re from email_validator import EmailUndeliverableError, \ - validate_email + validate_email, caching_resolver from email_validator.deliverability import validate_email_deliverability from mocked_dns_response import MockedDnsResponseData, MockedDnsResponseDataCleanup # noqa: F401 @@ -53,3 +53,24 @@ def test_deliverability_dns_timeout(): response = validate_email_deliverability('timeout.com', 'timeout.com', dns_resolver=RESOLVER) assert "mx" not in response assert response.get("unknown-deliverability") == "timeout" + + +@pytest.mark.network +def test_caching_dns_resolver(): + class TestCache: + def __init__(self): + self.cache = {} + + def get(self, key): + return self.cache.get(key) + + def put(self, key, value): + self.cache[key] = value + + cache = TestCache() + resolver = caching_resolver(timeout=1, cache=cache) + validate_email("test@gmail.com", dns_resolver=resolver) + assert len(cache.cache) == 1 + + validate_email("test@gmail.com", dns_resolver=resolver) + assert len(cache.cache) == 1 diff --git a/tests/test_main.py b/tests/test_main.py index 6676ff6..34005da 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,4 +1,6 @@ -from email_validator import validate_email +import pytest + +from email_validator import validate_email, EmailSyntaxError # Let's test main but rename it to be clear from 
email_validator.__main__ import main as validator_command_line_tool @@ -45,3 +47,14 @@ def test_main_multi_input(monkeypatch, capsys): assert test_cases[1] not in stdout assert test_cases[2] in stdout assert test_cases[3] in stdout + + +def test_bytes_input(): + input_email = b"testaddr@example.tld" + valid_email = validate_email(input_email, check_deliverability=False) + assert isinstance(valid_email.as_dict(), dict) + assert valid_email.as_dict()["email"] == input_email.decode("utf8") + + input_email = "testaddr中example.tld".encode("utf32") + with pytest.raises(EmailSyntaxError): + validate_email(input_email, check_deliverability=False) diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 41042d5..18280b7 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -239,11 +239,18 @@ def test_email_valid(email_input, output): ('111111111122222222223333333333444444444455555555556666666666777777@example.com', 'The email address is too long before the @-sign (2 characters too many).'), ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444444444455555555556.com', 'The email address is too long (4 characters too many).'), ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555566.com', 'The email address is too long after the @-sign (1 character too many).'), + ('me@中1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555566.com', 'The email address is too long after the @-sign.'), 
('my.long.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444.info', 'The email address is too long (2 characters too many).'), ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333.info', 'The email address is too long (when converted to IDNA ASCII).'), ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (at least 1 character too many).'), ('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444.info', 'The email address is too long (when encoded in bytes).'), ('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (at least 1 character too many).'), + ('me@bad-tld-1', 'The part after the @-sign is not valid. It should have a period.'), + ('me@bad.tld-2', 'The part after the @-sign is not valid. 
It is not within a valid top-level domain.'), + ('me@-', 'The part after the @-sign contains invalid characters (Label must not start or end with a hyphen).'), + ('me@x!', 'The part after the @-sign contains invalid characters (Codepoint U+0021 at position 2 of \'x!\' not allowed).'), + ('me@xn--', 'The part after the @-sign contains invalid characters (Malformed A-label, no Punycode eligible content found).'), + ('me@yy--', 'The part after the @-sign contains invalid characters (Label has disallowed hyphens in 3rd and 4th position).'), ], ) def test_email_invalid_syntax(email_input, error_msg): @@ -287,6 +294,7 @@ def test_email_invalid_reserved_domain(email_input): ('\uD800@test'), # surrogate (Cs) ('\uE000@test'), # private use (Co) ('\uFDEF@test'), # unassigned (Cn) + ('\u0300@test'), # grave accent (M) ], ) def test_email_unsafe_character(email_input): From 4f69b4254cfa35f5f27659bc9fc0c1bba130ddb0 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 4 Feb 2023 16:29:46 -0500 Subject: [PATCH 068/174] Don't rely on the IDNA library for checking valid hyphens, use our own checks Including invalid RFC 5890 R-LDH labels (e.g. '??--' other than 'xn--'), see #92. The IDNA library will check this but its error messages are not friendly, and for future proofing it's better to not assume it does any general syntax checks. 
--- email_validator/syntax.py | 19 +++++++++++++++---- tests/test_syntax.py | 28 +++++++++++++++++----------- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 3c34e56..e58aadf 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -130,16 +130,27 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera except idna.IDNAError as e: raise EmailSyntaxError("The part after the @-sign contains invalid characters ({}).".format(str(e))) - # Now we can perform basic checks on the use of periods (since equivalent - # symbols have been mapped to periods). These checks are needed because the - # IDNA library doesn't handle well domains that have empty labels (i.e. initial - # dot, trailing dot, or two dots in a row). + # The domain part is made up period-separated "labels." Each label must + # have at least one character and cannot start or end with dashes, which + # means there are some surprising restrictions on periods and dashes. + # Check that before we do IDNA encoding because the IDNA library gives + # unfriendly errors for these cases, but after UTS-46 normalization because + # it can insert periods and hyphens (from fullwidth characters). if domain.endswith("."): raise EmailSyntaxError("An email address cannot end with a period.") if domain.startswith("."): raise EmailSyntaxError("An email address cannot have a period immediately after the @-sign.") if ".." in domain: raise EmailSyntaxError("An email address cannot have two periods in a row.") + if domain.endswith("-"): + raise EmailSyntaxError("An email address cannot end with a hyphen.") + if domain.startswith("-"): + raise EmailSyntaxError("An email address cannot have a hyphen immediately after the @-sign.") + if ".-" in domain or "-." 
in domain: + raise EmailSyntaxError("An email address cannot have a period and a hyphen next to each other.") + for label in domain.split("."): + if re.match(r"(?!xn)..--", label, re.I): # RFC 5890 invalid R-LDH labels + raise EmailSyntaxError("An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.") if DOT_ATOM_TEXT_HOSTNAME.match(domain): ascii_domain = domain diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 18280b7..0bd01d6 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -210,15 +210,21 @@ def test_email_valid(email_input, output): [ ('my@localhost', 'The part after the @-sign is not valid. It should have a period.'), ('my@.leadingdot.com', 'An email address cannot have a period immediately after the @-sign.'), - ('my@..leadingfwdot.com', 'An email address cannot have a period immediately after the @-sign.'), - ('my@..twodots.com', 'An email address cannot have a period immediately after the @-sign.'), + ('my@.leadingfwdot.com', 'An email address cannot have a period immediately after the @-sign.'), ('my@twodots..com', 'An email address cannot have two periods in a row.'), - ('my@baddash.-.com', - 'The part after the @-sign contains invalid characters (Label must not start or end with a hyphen).'), - ('my@baddash.-a.com', - 'The part after the @-sign contains invalid characters (Label must not start or end with a hyphen).'), - ('my@baddash.b-.com', - 'The part after the @-sign contains invalid characters (Label must not start or end with a hyphen).'), + ('my@twofwdots...com', 'An email address cannot have two periods in a row.'), + ('my@trailingdot.com.', 'An email address cannot end with a period.'), + ('my@trailingfwdot.com.', 'An email address cannot end with a period.'), + ('me@-leadingdash', 'An email address cannot have a hyphen immediately after the @-sign.'), + ('me@-leadingdashfw', 'An email address cannot have a hyphen immediately after the 
@-sign.'), + ('me@trailingdash-', 'An email address cannot end with a hyphen.'), + ('me@trailingdashfw-', 'An email address cannot end with a hyphen.'), + ('my@baddash.-.com', 'An email address cannot have a period and a hyphen next to each other.'), + ('my@baddash.-a.com', 'An email address cannot have a period and a hyphen next to each other.'), + ('my@baddash.b-.com', 'An email address cannot have a period and a hyphen next to each other.'), + ('my@baddashfw.-.com', 'An email address cannot have a period and a hyphen next to each other.'), + ('my@baddashfw.-a.com', 'An email address cannot have a period and a hyphen next to each other.'), + ('my@baddashfw.b-.com', 'An email address cannot have a period and a hyphen next to each other.'), ('my@example.com\n', 'The part after the @-sign contains invalid characters (Codepoint U+000A at position 4 of ' '\'com\\n\' not allowed).'), @@ -247,10 +253,10 @@ def test_email_valid(email_input, output): ('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (at least 1 character too many).'), ('me@bad-tld-1', 'The part after the @-sign is not valid. It should have a period.'), ('me@bad.tld-2', 'The part after the @-sign is not valid. 
It is not within a valid top-level domain.'), - ('me@-', 'The part after the @-sign contains invalid characters (Label must not start or end with a hyphen).'), ('me@x!', 'The part after the @-sign contains invalid characters (Codepoint U+0021 at position 2 of \'x!\' not allowed).'), - ('me@xn--', 'The part after the @-sign contains invalid characters (Malformed A-label, no Punycode eligible content found).'), - ('me@yy--', 'The part after the @-sign contains invalid characters (Label has disallowed hyphens in 3rd and 4th position).'), + ('me@xn--0.tld', 'The part after the @-sign is not valid IDNA (Invalid A-label).'), + ('me@yy--0.tld', 'An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.'), + ('me@yy--0.tld', 'An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.'), ], ) def test_email_invalid_syntax(email_input, error_msg): From 0f0b4a45fbb07e8a2e0d2f8c8e3fbc3f9feb2d69 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 4 Feb 2023 18:01:49 -0500 Subject: [PATCH 069/174] Add a test for a domain without a MX record, with an A record, but with a reject-all SPF record --- tests/mocked-dns-answers.json | 31 ++++++++++++++++++++++++++----- tests/test_deliverability.py | 5 +++++ 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/tests/mocked-dns-answers.json b/tests/mocked-dns-answers.json index bc5465b..39c95a1 100644 --- a/tests/mocked-dns-answers.json +++ b/tests/mocked-dns-answers.json @@ -41,16 +41,37 @@ }, { "query": { - "name": "mail.example", + "name": "nellis.af.mil", "type": "MX", "class": "IN" }, "answer": [] }, + { + "query": { + "name": "nellis.af.mil", + "type": "A", + "class": "IN" + }, + "answer": [ + "132.58.234.0" + ] + }, + { + "query": { + "name": "nellis.af.mil", + "type": "TXT", + "class": "IN" + }, + "answer": [ + "\"v=spf1 -all\"", + "\"MS=ms47108184\"" + ] + }, { "query": { 
"name": "mail.example", - "type": "ANY", + "type": "MX", "class": "IN" }, "answer": [] @@ -58,15 +79,15 @@ { "query": { "name": "mail.example", - "type": "AAAA", + "type": "ANY", "class": "IN" }, "answer": [] }, { "query": { - "name": "mail.example.com", - "type": "MX", + "name": "mail.example", + "type": "AAAA", "class": "IN" }, "answer": [] diff --git a/tests/test_deliverability.py b/tests/test_deliverability.py index e1b3dbc..46ec3d8 100644 --- a/tests/test_deliverability.py +++ b/tests/test_deliverability.py @@ -31,6 +31,11 @@ def test_deliverability_fails(): with pytest.raises(EmailUndeliverableError, match='The domain name {} does not accept email'.format(domain)): validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) + # No MX record, A record fallback, reject-all SPF record. + domain = 'nellis.af.mil' + with pytest.raises(EmailUndeliverableError, match='The domain name {} does not send email'.format(domain)): + validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) + @pytest.mark.parametrize( 'email_input', From 68fb9f2e33dfa30442b295d8406ad0cc3f034c5b Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 4 Feb 2023 18:12:18 -0500 Subject: [PATCH 070/174] Handle NoNameservers and NXDOMAIN exceptions better I am not sure what NoNameservers means, so I think it might be that no local nameservers could respond. Local error conditions should not fail deliverability (same as a timeout). NXDOMAIN means no records are present for a domain, so a NXDOMAIN after the MX query can skip straight to rejecting the domain, rather than going to the A/AAAA fallback. Consequently, the error for a missing fallback can be changed from "does not exist" (which is now handled by the NXDOMAIN except block) to "does not accept email." 
--- email_validator/deliverability.py | 31 +++++++++++++++++++++---------- tests/mocked-dns-answers.json | 16 ++++------------ tests/mocked_dns_response.py | 2 +- 3 files changed, 26 insertions(+), 23 deletions(-) diff --git a/email_validator/deliverability.py b/email_validator/deliverability.py index a7c3752..9616afc 100644 --- a/email_validator/deliverability.py +++ b/email_validator/deliverability.py @@ -45,31 +45,33 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve # email is not deliverable. mtas = [(preference, exchange) for preference, exchange in mtas if exchange != ""] - if len(mtas) == 0: + if len(mtas) == 0: # null MX only, if there were no MX records originally a NoAnswer exception would have occurred raise EmailUndeliverableError("The domain name %s does not accept email." % domain_i18n) deliverability_info["mx"] = mtas deliverability_info["mx_fallback_type"] = None - except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN, dns.resolver.NoAnswer): - + except dns.resolver.NoAnswer: # If there was no MX record, fall back to an A record, as SMTP servers do. try: response = dns_resolver.resolve(domain, "A") deliverability_info["mx"] = [(0, str(r)) for r in response] deliverability_info["mx_fallback_type"] = "A" - except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN, dns.resolver.NoAnswer): + + except dns.resolver.NoAnswer: # If there was no A record, fall back to an AAAA record. try: response = dns_resolver.resolve(domain, "AAAA") deliverability_info["mx"] = [(0, str(r)) for r in response] deliverability_info["mx_fallback_type"] = "AAAA" - except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN, dns.resolver.NoAnswer): + except dns.resolver.NoAnswer: # If there was no MX, A, or AAAA record, then mail to - # this domain is not deliverable. - raise EmailUndeliverableError("The domain name %s does not exist." 
% domain_i18n) + # this domain is not deliverable, although the domain + # name has other records (otherwise NXDOMAIN would + # have been raised). + raise EmailUndeliverableError("The domain name %s does not accept email." % domain_i18n) # Check for a SPF reject-all record ("v=spf1 -all") which indicates # no emails are sent from this domain (similar to a NULL MX record @@ -87,9 +89,18 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve except dns.resolver.NoAnswer: # No TXT records means there is no SPF policy, so we cannot take any action. pass - except (dns.resolver.NoNameservers, dns.resolver.NXDOMAIN): - # Failure to resolve at this step will be ignored. - pass + + except dns.resolver.NXDOMAIN: + # The domain name does not exist --- there are no records of any sort + # for the domain name. + raise EmailUndeliverableError("The domain name %s does not exist." % domain_i18n) + + except dns.resolver.NoNameservers: + # All nameservers failed to answer the query. This might be a problem + # with local nameservers, maybe? We'll allow the domain to go through. + return { + "unknown-deliverability": "no_nameservers", + } except dns.exception.Timeout: # A timeout could occur for various reasons, so don't treat it as a failure. 
diff --git a/tests/mocked-dns-answers.json b/tests/mocked-dns-answers.json index 39c95a1..d5f6761 100644 --- a/tests/mocked-dns-answers.json +++ b/tests/mocked-dns-answers.json @@ -16,7 +16,7 @@ { "query": { "name": "xkxufoekjvjfjeodlfmdfjcu.com", - "type": "ANY", + "type": "MX", "class": "IN" }, "answer": [] @@ -24,7 +24,7 @@ { "query": { "name": "xkxufoekjvjfjeodlfmdfjcu.com", - "type": "AAAA", + "type": "ANY", "class": "IN" }, "answer": [] @@ -84,18 +84,10 @@ }, "answer": [] }, - { - "query": { - "name": "mail.example", - "type": "AAAA", - "class": "IN" - }, - "answer": [] - }, { "query": { "name": "mail.example.com", - "type": "ANY", + "type": "MX", "class": "IN" }, "answer": [] @@ -103,7 +95,7 @@ { "query": { "name": "mail.example.com", - "type": "AAAA", + "type": "ANY", "class": "IN" }, "answer": [] diff --git a/tests/mocked_dns_response.py b/tests/mocked_dns_response.py index 2ffabf6..124f208 100644 --- a/tests/mocked_dns_response.py +++ b/tests/mocked_dns_response.py @@ -101,7 +101,7 @@ def get(self, key): # Query the data for a response to an ANY query. ANY = dns.rdatatype.from_text("ANY") if (key[0], ANY, key[2]) in self.data and self.data[(key[0], ANY, key[2])] is None: - raise dns.resolver.NoAnswer() + raise dns.resolver.NXDOMAIN() raise ValueError("Saved DNS data did not contain query: {}".format(key)) From 92596f2c54f1295b24cd91e6cc3dfec4cd9f2e25 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 11 Feb 2023 10:01:28 -0500 Subject: [PATCH 071/174] Add forgotten pytest marker registration See 52f2ab7c. --- setup.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.cfg b/setup.cfg index e35bc55..026c63b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -42,3 +42,5 @@ max-line-length = 120 testpaths = tests filterwarnings = error +markers = + network: mark a test as requiring Internet access. 
From 88576acd7c62ee1d9b53e0f1a86d7a71d894a7e8 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 11 Feb 2023 10:04:46 -0500 Subject: [PATCH 072/174] Mark support for Python 3.11 Travis previously gave an error, but Travis won't run for me anymore without paying, so I can't reproduce the issue. Tests pass in a python:3.11-slim docker container. Fixes #97 --- .travis.yml | 2 +- setup.cfg | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index e55ca23..f09290d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ python: - '3.8' - '3.9' - '3.10' -#- '3.11' +- '3.11' - '3.12-dev' install: diff --git a/setup.cfg b/setup.cfg index 026c63b..65a0a3e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -18,6 +18,7 @@ classifiers = Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.11 Topic :: Software Development :: Libraries :: Python Modules keywords = email address validator From b4814e3d0f4f1e25720a8ea17e393dee58e38519 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 11 Feb 2023 10:22:50 -0500 Subject: [PATCH 073/174] Add some CHANGELOG entries --- CHANGELOG.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c256496..6d523dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,13 @@ In Development -------------- -* Python versions through 3.5 and dnspython 1.x are no longer supported. Python 3.6+ with dnspython 2.x are now required. +* Python versions through 3.5 and dnspython 1.x are no longer supported. Python 3.6+ (now through Python 3.11) with dnspython 2.x are now required. +* The dnspython package is no longer required if DNS checks are not used. +* NoNameservers and NXDOMAIN DNS errors are now handled differently: NoNameservers no longer fails validation, and NXDOMAIN now skips checking for an A/AAAA fallback and goes straight to failing validation. 
+* Some syntax error messages have changed because they are now checked explicitly rather than as a part of other checks. +* Some other error messages have changed to not repeat the email address in the error message. * The library has been reorganized internally into smaller modules. +* The tests have been reorganized and expanded. Deliverability tests now mostly use captured DNS responses so they can be run off-line. Version 1.3.1 (January 21, 2023) -------------------------------- From edf23bc51af33f7586fffa50464685572ad9c810 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 27 Feb 2023 17:19:25 -0500 Subject: [PATCH 074/174] Drop support for Python 3.6 which is EOL'd already --- .travis.yml | 1 - CHANGELOG.md | 2 +- README.md | 2 +- setup.cfg | 2 +- test_requirements.txt | 30 +++++++++++++++--------------- 5 files changed, 18 insertions(+), 19 deletions(-) diff --git a/.travis.yml b/.travis.yml index f09290d..ec189a2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,6 @@ language: python cache: pip python: -- '3.6' - '3.7' - '3.8' - '3.9' diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d523dc..d4366cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ In Development -------------- -* Python versions through 3.5 and dnspython 1.x are no longer supported. Python 3.6+ (now through Python 3.11) with dnspython 2.x are now required. +* Python versions through 3.6 and dnspython 1.x are no longer supported. Python 3.7+ with dnspython 2.x are now required. * The dnspython package is no longer required if DNS checks are not used. * NoNameservers and NXDOMAIN DNS errors are now handled differently: NoNameservers no longer fails validation, and NXDOMAIN now skips checking for an A/AAAA fallback and goes straight to failing validation. * Some syntax error messages have changed because they are now checked explicitly rather than as a part of other checks. 
diff --git a/README.md b/README.md index a483bc8..c0aec1a 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ email-validator: Validate Email Addresses ========================================= A robust email address syntax and deliverability validation library for -Python 3.6+ by [Joshua Tauberer](https://joshdata.me). +Python 3.7+ by [Joshua Tauberer](https://joshdata.me). This library validates that a string is of the form `name@example.com` and optionally checks that the domain name is set up to receive email. This is the sort of validation you would want for an email-based registration form on diff --git a/setup.cfg b/setup.cfg index 65a0a3e..6b62044 100644 --- a/setup.cfg +++ b/setup.cfg @@ -27,7 +27,7 @@ packages = find: install_requires = dnspython>=2.0.0 # optional if deliverability check isn't needed idna>=2.0.0 -python_requires = >=3.6 +python_requires = >=3.7 [options.entry_points] console_scripts = diff --git a/test_requirements.txt b/test_requirements.txt index f793d1c..e623d5c 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -1,27 +1,27 @@ -# This file was generated by running -# sudo docker run --rm -it --network=host python:3.6.15-slim /bin/bash +# This file was generated by running: +# sudo docker run --rm -it --network=host python:3.7-slim /bin/bash # pip install dnspython idna # from setup.cfg # pip install pytest pytest-cov coverage flake8 # pip freeze -# in a virtualenv with Python 3.6. (Some packages' latest versions -# are not compatible with Python 3.6, so we must pin versions for -# repeatable testing in earlier versions of Python.) +# (Some packages' latest versions may not be compatible with +# the earliest Python version we support, and some exception +# messages may depend on package versions, so we pin versions +# for reproducible testing.) 
attrs==22.2.0 -coverage==6.2 -dnspython==2.2.1 +coverage==7.2.1 +dnspython==2.3.0 +exceptiongroup==1.1.0 flake8==5.0.4 idna==3.4 importlib-metadata==4.2.0 -iniconfig==1.1.1 +iniconfig==2.0.0 mccabe==0.7.0 -packaging==21.3 +packaging==23.0 pluggy==1.0.0 -py==1.11.0 pycodestyle==2.9.1 pyflakes==2.5.0 -pyparsing==3.0.9 -pytest==7.0.1 +pytest==7.2.1 pytest-cov==4.0.0 -tomli==1.2.3 -typing_extensions==4.1.1 -zipp==3.6.0 +tomli==2.0.1 +typing_extensions==4.5.0 +zipp==3.15.0 From 18e880cc41a0830892f889aaf6fdc14dc5afcbac Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 27 Feb 2023 17:35:07 -0500 Subject: [PATCH 075/174] Add GitHub Actions workflow since TravisCI stopped working --- .github/workflows/test_and_build.yaml | 28 +++++++++++++++++++++++++++ Makefile | 2 +- README.md | 2 +- 3 files changed, 30 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/test_and_build.yaml diff --git a/.github/workflows/test_and_build.yaml b/.github/workflows/test_and_build.yaml new file mode 100644 index 0000000..40314b5 --- /dev/null +++ b/.github/workflows/test_and_build.yaml @@ -0,0 +1,28 @@ +name: Tests + +on: [push] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12.0-alpha.5"] + + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r test_requirements.txt + - name: Lint with flake8 + run: | + make lint + - name: Test with pytest + run: | + make test diff --git a/Makefile b/Makefile index 71f8600..9226591 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,7 @@ lint: .PHONY: test test: - pytest --cov=email_validator + PYTHONPATH=.:$PYTHONPATH pytest --cov=email_validator .PHONY: testcov testcov: test diff --git a/README.md b/README.md index c0aec1a..10ce952 100644 --- 
a/README.md +++ b/README.md @@ -27,7 +27,7 @@ And this library does NOT permit obsolete forms of email addresses, so if you need strict validation against the email specs exactly, use [pyIsEmail](https://github.com/michaelherold/pyIsEmail). -[![Build Status](https://app.travis-ci.com/JoshData/python-email-validator.svg?branch=main)](https://app.travis-ci.com/JoshData/python-email-validator) +[![Build Status](https://github.com/JoshData/python-email-validator/actions/workflows/test_and_build.yaml/badge.svg)](https://github.com/JoshData/python-email-validator/actions/workflows/test_and_build.yaml) View the [CHANGELOG / Release Notes](CHANGELOG.md) for the version history of changes in the library. Occasionally this README is ahead of the latest published package --- see the CHANGELOG for details. From 210c661cfd4be73132282916e729ced99e5a051e Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 28 Feb 2023 09:54:41 -0500 Subject: [PATCH 076/174] Improve exception messages related to the dot-atom rule and also invalid and unsafe characters * Check for invalid and (now) unsafe characters in domain names before going to IDNA parsing so we can be more sure we don't generate unsafe strings in error messages. It's also clearer. But these were probably invalid anyway per IDNA rules. * Check for bad characters in local parts before using the dot-atom regex because the regex can fail because of invalid dot usage and the error message wouldn't indicate that. * Check for invalid dot usage in the local part explicitly to improve error messages. Add tests. * Use a safe and intelligible (no Python escape codes) representation of invalid characters in error messages. 
--- email_validator/rfc_constants.py | 5 +- email_validator/syntax.py | 138 ++++++++++++++++++++++--------- tests/test_syntax.py | 65 +++++++++------ 3 files changed, 141 insertions(+), 67 deletions(-) diff --git a/email_validator/rfc_constants.py b/email_validator/rfc_constants.py index afe0982..82bc726 100644 --- a/email_validator/rfc_constants.py +++ b/email_validator/rfc_constants.py @@ -21,8 +21,9 @@ # must also satisfy the requirements of RFC 952/RFC 1123 which restrict # the allowed characters of hostnames further. The hyphen cannot be at # the beginning or end of a *dot-atom component* of a hostname either. -ATEXT_HOSTNAME = r'(?:(?:[a-zA-Z0-9][a-zA-Z0-9\-]*)?[a-zA-Z0-9])' -DOT_ATOM_TEXT_HOSTNAME = re.compile(ATEXT_HOSTNAME + r'(?:\.' + ATEXT_HOSTNAME + r')*\Z') +ATEXT_HOSTNAME_INTL = re.compile(r"[a-zA-Z0-9\-\." + "\u0080-\U0010FFFF" + "]") +HOSTNAME_LABEL = r'(?:(?:[a-zA-Z0-9][a-zA-Z0-9\-]*)?[a-zA-Z0-9])' +DOT_ATOM_TEXT_HOSTNAME = re.compile(HOSTNAME_LABEL + r'(?:\.' + HOSTNAME_LABEL + r')*\Z') DOMAIN_NAME_REGEX = re.compile(r"[A-Za-z]\Z") # all TLDs currently end with a letter # Length constants diff --git a/email_validator/syntax.py b/email_validator/syntax.py index e58aadf..9d57ddb 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -1,6 +1,6 @@ from .exceptions_types import EmailSyntaxError from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \ - DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT, ATEXT_INTL, DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX + DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT, ATEXT_INTL, ATEXT_HOSTNAME_INTL, DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX import re import unicodedata @@ -16,6 +16,21 @@ def get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH): return reason.format(prefix, diff, suffix) +def safe_character_display(c): + # Return safely displayable characters in quotes. 
+ if unicodedata.category(c)[0] in ("L", "N", "P", "S"): + return repr(c) + + # Construct a hex string in case the unicode name doesn't exist. + if ord(c) < 0xFFFF: + h = "U+{:04x}".format(ord(c)).upper() + else: + h = "U+{:08x}".format(ord(c)).upper() + + # Return the character name or, if it has no name, the hex string. + return unicodedata.name(c, h) + + def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=False): """Validates the syntax of the local part of an email address.""" @@ -41,6 +56,19 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals reason = get_length_reason(local, limit=LOCAL_PART_MAX_LENGTH) raise EmailSyntaxError("The email address is too long before the @-sign {}.".format(reason)) + # Check for invalid characters. + atext_re = re.compile('[.' + (ATEXT if not allow_smtputf8 else ATEXT_INTL) + ']') + bad_chars = set( + safe_character_display(c) + for c in local + if not atext_re.match(c) + ) + if bad_chars: + raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".") + + # Check for dot errors imposted by the dot-atom rule. + check_dot_atom(local, 'An email address cannot start with a {}.', 'An email address cannot have a {} immediately before the @-sign.', is_hostname=False) + # Check the local part against the regular expression for the older ASCII requirements. m = DOT_ATOM_TEXT.match(local) if m: @@ -53,14 +81,10 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals else: # The local part failed the ASCII check. Now try the extended internationalized requirements. + # This should already be handled by the bad_chars and check_dot_atom tests above. m = DOT_ATOM_TEXT_INTL.match(local) if not m: - # It's not a valid internationalized address either. Report which characters were not valid. 
- bad_chars = ', '.join(sorted(set( - unicodedata.name(c, repr(c)) for c in local if not re.match(u"[" + (ATEXT if not allow_smtputf8 else ATEXT_INTL) + u"]", c) - ))) - raise EmailSyntaxError("The email address contains invalid characters before the @-sign: %s." % bad_chars) - + raise EmailSyntaxError("The email address contains invalid characters before the @-sign.") # It would be valid if internationalized characters were allowed by the caller. if not allow_smtputf8: raise EmailSyntaxError("Internationalized characters before the @-sign are not supported.") @@ -74,28 +98,7 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals # Check for unsafe characters. # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked # by DOT_ATOM_TEXT_INTL. - for i, c in enumerate(local): - category = unicodedata.category(c) - if category[0] in ("L", "N", "P", "S"): - # letters, numbers, punctuation, and symbols are permitted - pass - elif category[0] == "M": - # combining character in first position would combine with something - # outside of the email address if concatenated to the right, but are - # otherwise permitted - if i == 0: - raise EmailSyntaxError("The email address contains an initial invalid character (%s)." - % unicodedata.name(c, repr(c))) - elif category[0] in ("Z", "C"): - # spaces and line/paragraph characters (Z) and - # control, format, surrogate, private use, and unassigned code points (C) - raise EmailSyntaxError("The email address contains an invalid character (%s)." - % unicodedata.name(c, repr(c))) - else: - # All categories should be handled above, but in case there is something new - # in the future. - raise EmailSyntaxError("The email address contains a character (%s; category %s) that may not be safe." - % (unicodedata.name(c, repr(c)), category)) + check_unsafe_chars(local) # Try encoding to UTF-8. Failure is possible with some characters like # surrogate code points, but those are checked above. 
Still, we don't @@ -113,6 +116,48 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals } +def check_unsafe_chars(s): + bad_chars = set() + for i, c in enumerate(s): + category = unicodedata.category(c) + if category[0] in ("L", "N", "P", "S"): + # letters, numbers, punctuation, and symbols are permitted + pass + elif category[0] == "M": + # combining character in first position would combine with something + # outside of the email address if concatenated to the right, but are + # otherwise permitted + if i == 0: + bad_chars.add(c) + elif category[0] in ("Z", "C"): + # spaces and line/paragraph characters (Z) and + # control, format, surrogate, private use, and unassigned code points (C) + bad_chars.add(c) + else: + # All categories should be handled above, but in case there is something new + # in the future. + bad_chars.add(c) + if bad_chars: + raise EmailSyntaxError("The email address contains unsafe characters: " + + ", ".join(safe_character_display(c) for c in sorted(bad_chars)) + ".") + + +def check_dot_atom(label, start_descr, end_descr, is_hostname): + if label.endswith("."): + raise EmailSyntaxError(end_descr.format("period")) + if label.startswith("."): + raise EmailSyntaxError(start_descr.format("period")) + if ".." in label: + raise EmailSyntaxError("An email address cannot have two periods in a row.") + if is_hostname: + if label.endswith("-"): + raise EmailSyntaxError(end_descr.format("hyphen")) + if label.startswith("-"): + raise EmailSyntaxError(start_descr.format("hyphen")) + if ".-" in label or "-." 
in label: + raise EmailSyntaxError("An email address cannot have a period and a hyphen next to each other.") + + def validate_email_domain_part(domain, test_environment=False, globally_deliverable=True): """Validates the syntax of the domain part of an email address.""" @@ -120,6 +165,16 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera if len(domain) == 0: raise EmailSyntaxError("There must be something after the @-sign.") + # Check for invalid characters before normalization. + bad_chars = set( + safe_character_display(c) + for c in domain + if not ATEXT_HOSTNAME_INTL.match(c) + ) + if bad_chars: + raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".") + check_unsafe_chars(domain) + # Perform UTS-46 normalization, which includes casefolding, NFC normalization, # and converting all label separators (the period/full stop, fullwidth full stop, # ideographic full stop, and halfwidth ideographic full stop) to basic periods. @@ -136,23 +191,13 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera # Check that before we do IDNA encoding because the IDNA library gives # unfriendly errors for these cases, but after UTS-46 normalization because # it can insert periods and hyphens (from fullwidth characters). - if domain.endswith("."): - raise EmailSyntaxError("An email address cannot end with a period.") - if domain.startswith("."): - raise EmailSyntaxError("An email address cannot have a period immediately after the @-sign.") - if ".." in domain: - raise EmailSyntaxError("An email address cannot have two periods in a row.") - if domain.endswith("-"): - raise EmailSyntaxError("An email address cannot end with a hyphen.") - if domain.startswith("-"): - raise EmailSyntaxError("An email address cannot have a hyphen immediately after the @-sign.") - if ".-" in domain or "-." 
in domain: - raise EmailSyntaxError("An email address cannot have a period and a hyphen next to each other.") + check_dot_atom(domain, 'An email address cannot have a {} immediately after the @-sign.', 'An email address cannot end with a {}.', is_hostname=True) for label in domain.split("."): if re.match(r"(?!xn)..--", label, re.I): # RFC 5890 invalid R-LDH labels raise EmailSyntaxError("An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.") if DOT_ATOM_TEXT_HOSTNAME.match(domain): + # This is a valid non-internationalized domain. ascii_domain = domain else: # If international characters are present in the domain name, convert @@ -236,6 +281,17 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera except idna.IDNAError as e: raise EmailSyntaxError("The part after the @-sign is not valid IDNA ({}).".format(str(e))) + # Check for invalid characters after normalization. These + # should never arise. 
+ bad_chars = set( + safe_character_display(c) + for c in domain + if not ATEXT_HOSTNAME_INTL.match(c) + ) + if bad_chars: + raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".") + check_unsafe_chars(domain) + # Return the IDNA ASCII-encoded form of the domain, which is how it # would be transmitted on the wire (except when used with SMTPUTF8 # possibly), as well as the canonical Unicode form of the domain, diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 0bd01d6..f60bf50 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -226,21 +226,21 @@ def test_email_valid(email_input, output): ('my@baddashfw.-a.com', 'An email address cannot have a period and a hyphen next to each other.'), ('my@baddashfw.b-.com', 'An email address cannot have a period and a hyphen next to each other.'), ('my@example.com\n', - 'The part after the @-sign contains invalid characters (Codepoint U+000A at position 4 of ' - '\'com\\n\' not allowed).'), + 'The part after the @-sign contains invalid characters: U+000A.'), ('my@example\n.com', - 'The part after the @-sign contains invalid characters (Codepoint U+000A at position 8 of ' - '\'example\\n\' not allowed).'), - ('.leadingdot@domain.com', 'The email address contains invalid characters before the @-sign: FULL STOP.'), - ('..twodots@domain.com', 'The email address contains invalid characters before the @-sign: FULL STOP.'), - ('twodots..here@domain.com', 'The email address contains invalid characters before the @-sign: FULL STOP.'), + 'The part after the @-sign contains invalid characters: U+000A.'), + ('me@x!', 'The part after the @-sign contains invalid characters: \'!\'.'), + ('me@x ', 'The part after the @-sign contains invalid characters: SPACE.'), + ('.leadingdot@domain.com', 'An email address cannot start with a period.'), + ('twodots..here@domain.com', 'An email address cannot have two periods in a row.'), + ('trailingdot.@domain.email', 'An 
email address cannot have a period immediately before the @-sign.'), ('me@⒈wouldbeinvalid.com', "The part after the @-sign contains invalid characters (Codepoint U+2488 not allowed " "at position 1 in '⒈wouldbeinvalid.com')."), ('@example.com', 'There must be something before the @-sign.'), - ('\nmy@example.com', 'The email address contains invalid characters before the @-sign: \'\\n\'.'), - ('m\ny@example.com', 'The email address contains invalid characters before the @-sign: \'\\n\'.'), - ('my\n@example.com', 'The email address contains invalid characters before the @-sign: \'\\n\'.'), + ('\nmy@example.com', 'The email address contains invalid characters before the @-sign: U+000A.'), + ('m\ny@example.com', 'The email address contains invalid characters before the @-sign: U+000A.'), + ('my\n@example.com', 'The email address contains invalid characters before the @-sign: U+000A.'), ('11111111112222222222333333333344444444445555555555666666666677777@example.com', 'The email address is too long before the @-sign (1 character too many).'), ('111111111122222222223333333333444444444455555555556666666666777777@example.com', 'The email address is too long before the @-sign (2 characters too many).'), ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444444444455555555556.com', 'The email address is too long (4 characters too many).'), @@ -253,7 +253,6 @@ def test_email_valid(email_input, output): ('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (at least 1 character too many).'), ('me@bad-tld-1', 'The part after the @-sign is not valid. 
It should have a period.'), ('me@bad.tld-2', 'The part after the @-sign is not valid. It is not within a valid top-level domain.'), - ('me@x!', 'The part after the @-sign contains invalid characters (Codepoint U+0021 at position 2 of \'x!\' not allowed).'), ('me@xn--0.tld', 'The part after the @-sign is not valid IDNA (Invalid A-label).'), ('me@yy--0.tld', 'An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.'), ('me@yy--0.tld', 'An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.'), @@ -289,25 +288,43 @@ def test_email_invalid_reserved_domain(email_input): @pytest.mark.parametrize( - 'email_input', + ('s', 'expected_error'), + [ + ('\u2005', 'FOUR-PER-EM SPACE'), # four-per-em space (Zs) + ('\u0300', 'COMBINING GRAVE ACCENT'), # grave accent (M) + ('\u009C', 'U+009C'), # string terminator (Cc) + ('\u200B', 'ZERO WIDTH SPACE'), # zero-width space (Cf) + ('\u202Dforward-\u202Ereversed', 'LEFT-TO-RIGHT OVERRIDE, RIGHT-TO-LEFT OVERRIDE'), # BIDI (Cf) + ('\uD800', 'U+D800'), # surrogate (Cs) + ('\uE000', 'U+E000'), # private use (Co) + ('\U0010FDEF', 'U+0010FDEF'), # priate use (Co) + ('\uFDEF', 'U+FDEF'), # unassigned (Cn) + ], +) +def test_email_unsafe_character(s, expected_error): + # Check for various unsafe characters: + + with pytest.raises(EmailSyntaxError) as exc_info: + validate_email(s + "@test", test_environment=True) + assert str(exc_info.value) == f"The email address contains unsafe characters: {expected_error}." 
+ + with pytest.raises(EmailSyntaxError) as exc_info: + validate_email("test@" + s, test_environment=True) + assert "The email address contains unsafe characters" in str(exc_info.value) + + +@pytest.mark.parametrize( + ('email_input', 'expected_error'), [ - ('white space@test'), - ('\n@test'), - ('\u2005@test'), # four-per-em space (Zs) - ('\u009C@test'), # string terminator (Cc) - ('\u200B@test'), # zero-width space (Cf) - ('\u202Dforward-\u202Ereversed@test'), # BIDI (Cf) - ('\uD800@test'), # surrogate (Cs) - ('\uE000@test'), # private use (Co) - ('\uFDEF@test'), # unassigned (Cn) - ('\u0300@test'), # grave accent (M) + ('white space@test', 'The email address contains invalid characters before the @-sign: SPACE.'), + ('\n@test', 'The email address contains invalid characters before the @-sign: U+000A.'), ], ) -def test_email_unsafe_character(email_input): +def test_email_invalid_character(email_input, expected_error): # Check for various unsafe characters: with pytest.raises(EmailSyntaxError) as exc_info: validate_email(email_input, test_environment=True) - assert "invalid character" in str(exc_info.value) + assert str(exc_info.value) == expected_error def test_email_test_domain_name_in_test_environment(): From d6a5d4b6870ff34a8ef3799174d6fe69776445f1 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Wed, 1 Mar 2023 09:19:49 -0500 Subject: [PATCH 077/174] Add more citations throughout the library --- email_validator/deliverability.py | 16 ++++--- email_validator/rfc_constants.py | 9 ++-- email_validator/syntax.py | 78 +++++++++++++++++++++++-------- 3 files changed, 72 insertions(+), 31 deletions(-) diff --git a/email_validator/deliverability.py b/email_validator/deliverability.py index 9616afc..19eee65 100644 --- a/email_validator/deliverability.py +++ b/email_validator/deliverability.py @@ -34,15 +34,16 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve try: try: - # Try resolving for MX records. 
+ # Try resolving for MX records (RFC 5321 Section 5). response = dns_resolver.resolve(domain, "MX") # For reporting, put them in priority order and remove the trailing dot in the qnames. mtas = sorted([(r.preference, str(r.exchange).rstrip('.')) for r in response]) - # Remove "null MX" records from the list (their value is (0, ".") but we've stripped - # trailing dots, so the 'exchange' is just ""). If there was only a null MX record, - # email is not deliverable. + # RFC 7505: Null MX (0, ".") records signify the domain does not accept email. + # Remove null MX records from the mtas list (but we've stripped trailing dots, + # so the 'exchange' is just "") so we can check if there are no non-null MX + # records remaining. mtas = [(preference, exchange) for preference, exchange in mtas if exchange != ""] if len(mtas) == 0: # null MX only, if there were no MX records originally a NoAnswer exception would have occurred @@ -52,7 +53,7 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve deliverability_info["mx_fallback_type"] = None except dns.resolver.NoAnswer: - # If there was no MX record, fall back to an A record, as SMTP servers do. + # If there was no MX record, fall back to an A record. (RFC 5321 Section 5) try: response = dns_resolver.resolve(domain, "A") deliverability_info["mx"] = [(0, str(r)) for r in response] @@ -61,6 +62,7 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve except dns.resolver.NoAnswer: # If there was no A record, fall back to an AAAA record. + # (It's unclear if SMTP servers actually do this.) try: response = dns_resolver.resolve(domain, "AAAA") deliverability_info["mx"] = [(0, str(r)) for r in response] @@ -73,8 +75,8 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve # have been raised). raise EmailUndeliverableError("The domain name %s does not accept email." 
% domain_i18n) - # Check for a SPF reject-all record ("v=spf1 -all") which indicates - # no emails are sent from this domain (similar to a NULL MX record + # Check for a SPF (RFC 7208) reject-all record ("v=spf1 -all") which indicates + # no emails are sent from this domain (similar to a Null MX record # but for sending rather than receiving). In combination with the # absence of an MX record, this is probably a good sign that the # domain is not used for email. diff --git a/email_validator/rfc_constants.py b/email_validator/rfc_constants.py index 82bc726..bf21a9c 100644 --- a/email_validator/rfc_constants.py +++ b/email_validator/rfc_constants.py @@ -18,9 +18,8 @@ DOT_ATOM_TEXT_INTL = re.compile('[' + ATEXT_INTL + ']+(?:\\.[' + ATEXT_INTL + r']+)*\Z') # The domain part of the email address, after IDNA (ASCII) encoding, -# must also satisfy the requirements of RFC 952/RFC 1123 which restrict -# the allowed characters of hostnames further. The hyphen cannot be at -# the beginning or end of a *dot-atom component* of a hostname either. +# must also satisfy the requirements of RFC 952/RFC 1123 Section 2.1 which +# restrict the allowed characters of hostnames further. ATEXT_HOSTNAME_INTL = re.compile(r"[a-zA-Z0-9\-\." + "\u0080-\U0010FFFF" + "]") HOSTNAME_LABEL = r'(?:(?:[a-zA-Z0-9][a-zA-Z0-9\-]*)?[a-zA-Z0-9])' DOT_ATOM_TEXT_HOSTNAME = re.compile(HOSTNAME_LABEL + r'(?:\.' + HOSTNAME_LABEL + r')*\Z') @@ -31,5 +30,5 @@ # explains the maximum length of an email address is 254 octets. 
EMAIL_MAX_LENGTH = 254 LOCAL_PART_MAX_LENGTH = 64 -DNS_LABEL_LENGTH_LIMIT = 63 # RFC 1035 2.3.1 -DOMAIN_MAX_LENGTH = 255 # RFC 1035 2.3.4 +DNS_LABEL_LENGTH_LIMIT = 63 # in "octets", RFC 1035 2.3.1 +DOMAIN_MAX_LENGTH = 255 # in "octets", RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2 diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 9d57ddb..cf7c304 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -46,7 +46,8 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals "smtputf8": False, } - # RFC 5321 4.5.3.1.1 + # Check the length of the local part by couting characters. + # (RFC 5321 4.5.3.1.1) # We're checking the number of characters here. If the local part # is ASCII-only, then that's the same as bytes (octets). If it's # internationalized, then the UTF-8 encoding may be longer, but @@ -57,6 +58,8 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals raise EmailSyntaxError("The email address is too long before the @-sign {}.".format(reason)) # Check for invalid characters. + # (RFC 2822 Section 3.2.4 / RFC 5322 Section 3.2.3, plus RFC 6531 section 3.3 + # if internationalized local parts are allowed) atext_re = re.compile('[.' + (ATEXT if not allow_smtputf8 else ATEXT_INTL) + ']') bad_chars = set( safe_character_display(c) @@ -67,9 +70,11 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".") # Check for dot errors imposted by the dot-atom rule. + # (RFC 2822 3.2.4) check_dot_atom(local, 'An email address cannot start with a {}.', 'An email address cannot have a {} immediately before the @-sign.', is_hostname=False) - # Check the local part against the regular expression for the older ASCII requirements. + # Check the local part against the non-internationalized regular expression. 
+ # (RFC 2822 3.2.4) m = DOT_ATOM_TEXT.match(local) if m: # Return the local part unchanged and flag that SMTPUTF8 is not needed. @@ -82,6 +87,7 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals else: # The local part failed the ASCII check. Now try the extended internationalized requirements. # This should already be handled by the bad_chars and check_dot_atom tests above. + # It's the same pattern but with additional characters permitted. m = DOT_ATOM_TEXT_INTL.match(local) if not m: raise EmailSyntaxError("The email address contains invalid characters before the @-sign.") @@ -97,7 +103,8 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals # Check for unsafe characters. # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked - # by DOT_ATOM_TEXT_INTL. + # by DOT_ATOM_TEXT_INTL. Other characters may be permitted by the email specs, but + # they may not be valid, safe, or sensible Unicode strings. check_unsafe_chars(local) # Try encoding to UTF-8. Failure is possible with some characters like @@ -117,25 +124,39 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals def check_unsafe_chars(s): + # Check for unsafe characters or characters that would make the string + # invalid or non-sensible Unicode. bad_chars = set() for i, c in enumerate(s): category = unicodedata.category(c) if category[0] in ("L", "N", "P", "S"): - # letters, numbers, punctuation, and symbols are permitted + # Letters, numbers, punctuation, and symbols are permitted. pass elif category[0] == "M": - # combining character in first position would combine with something - # outside of the email address if concatenated to the right, but are - # otherwise permitted + # Combining character in first position would combine with something + # outside of the email address if concatenated, so they are not safe. 
+ # We also check if this occurs after the @-sign, which would not be + # sensible. if i == 0: bad_chars.add(c) - elif category[0] in ("Z", "C"): - # spaces and line/paragraph characters (Z) and - # control, format, surrogate, private use, and unassigned code points (C) + elif category[0] == "Z": + # Spaces and line/paragraph characters (Z) outside of the ASCII range + # are not specifically disallowed as far as I can tell, but they + # violate the spirit of the non-internationalized specification that + # email addresses do not contain spaces or line breaks when not quoted. + bad_chars.add(c) + elif category[0] == "C": + # Control, format, surrogate, private use, and unassigned code points (C) + # are all unsafe in various ways. Control and format characters can affect + # text rendering if the email address is concatenated with other text. + # Bidirectional format characters are unsafe, even if used properly, because + # they cause an email address to render as a different email address. + # Private use characters do not make sense for publicly deliverable + # email addresses. bad_chars.add(c) else: # All categories should be handled above, but in case there is something new - # in the future. + # to the Unicode specification in the future, reject all other categories. bad_chars.add(c) if bad_chars: raise EmailSyntaxError("The email address contains unsafe characters: " @@ -143,13 +164,16 @@ def check_unsafe_chars(s): def check_dot_atom(label, start_descr, end_descr, is_hostname): + # RFC 2822 3.2.4 if label.endswith("."): raise EmailSyntaxError(end_descr.format("period")) if label.startswith("."): raise EmailSyntaxError(start_descr.format("period")) if ".." 
in label: raise EmailSyntaxError("An email address cannot have two periods in a row.") + if is_hostname: + # RFC 952 if label.endswith("-"): raise EmailSyntaxError(end_descr.format("hyphen")) if label.startswith("-"): @@ -166,6 +190,7 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera raise EmailSyntaxError("There must be something after the @-sign.") # Check for invalid characters before normalization. + # (RFC 952 plus RFC 6531 section 3.3 for internationalized addresses) bad_chars = set( safe_character_display(c) for c in domain @@ -173,6 +198,11 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera ) if bad_chars: raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".") + + # Check for unsafe characters. + # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked + # by DOT_ATOM_TEXT_INTL. Other characters may be permitted by the email specs, but + # they may not be valid, safe, or sensible Unicode strings. check_unsafe_chars(domain) # Perform UTS-46 normalization, which includes casefolding, NFC normalization, @@ -191,9 +221,13 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera # Check that before we do IDNA encoding because the IDNA library gives # unfriendly errors for these cases, but after UTS-46 normalization because # it can insert periods and hyphens (from fullwidth characters). + # (RFC 952, RFC 2822 3.2.4) check_dot_atom(domain, 'An email address cannot have a {} immediately after the @-sign.', 'An email address cannot end with a {}.', is_hostname=True) + + # Check for RFC 5890's invalid R-LDH labels, which are labels that start + # with two characters other than "xn" and two dashes. 
for label in domain.split("."): - if re.match(r"(?!xn)..--", label, re.I): # RFC 5890 invalid R-LDH labels + if re.match(r"(?!xn)..--", label, re.I): raise EmailSyntaxError("An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.") if DOT_ATOM_TEXT_HOSTNAME.match(domain): @@ -230,8 +264,9 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera if not m: raise EmailSyntaxError("The email address contains invalid characters after the @-sign after IDNA encoding.") - # RFC 5321 4.5.3.1.2 - # We're checking the number of bytes (octets) here, which can be much + # Check the length of the domain name in bytes. + # (RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2) + # We're checking the number of bytes ("octets") here, which can be much # higher than the number of characters in internationalized domains, # on the assumption that the domain may be transmitted without SMTPUTF8 # as IDNA ASCII. (This is also checked by idna.encode, so this exception @@ -239,14 +274,19 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera if len(ascii_domain) > DOMAIN_MAX_LENGTH: reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH) raise EmailSyntaxError("The email address is too long after the @-sign {}.".format(reason)) + + # Also check the label length limit. 
+ # (RFC 1035 2.3.1) for label in ascii_domain.split("."): if len(label) > DNS_LABEL_LENGTH_LIMIT: reason = get_length_reason(label, limit=DNS_LABEL_LENGTH_LIMIT) - raise EmailSyntaxError("On either side of the @-sign, periods cannot be separated by so many characters {}.".format(reason)) + raise EmailSyntaxError("After the @-sign, periods cannot be separated by so many characters {}.".format(reason)) if globally_deliverable: # All publicly deliverable addresses have domain named with at least - # one period, and we'll consider the lack of a period a syntax error + # one period, at least for gTLDs created since 2013 (per the ICANN Board + # New gTLD Program Committee, https://www.icann.org/en/announcements/details/new-gtld-dotless-domain-names-prohibited-30-8-2013-en). + # We'll consider the lack of a period a syntax error # since that will match people's sense of what an email address looks # like. We'll skip this in test environments to allow '@test' email # addresses. @@ -260,6 +300,7 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera # Check special-use and reserved domain names. # Some might fail DNS-based deliverability checks, but that # can be turned off, so we should fail them all sooner. + # See the references in __init__.py. from . import SPECIAL_USE_DOMAIN_NAMES for d in SPECIAL_USE_DOMAIN_NAMES: # See the note near the definition of SPECIAL_USE_DOMAIN_NAMES. @@ -274,15 +315,14 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera # but not be actual IDNA. For ASCII-only domains, the conversion out # of IDNA just gives the same thing back. # - # This gives us the canonical internationalized form of the domain, - # which we should use in all error messages. + # This gives us the canonical internationalized form of the domain. 
try: domain_i18n = idna.decode(ascii_domain.encode('ascii')) except idna.IDNAError as e: raise EmailSyntaxError("The part after the @-sign is not valid IDNA ({}).".format(str(e))) # Check for invalid characters after normalization. These - # should never arise. + # should never arise. See the similar checks above. bad_chars = set( safe_character_display(c) for c in domain From 00902f88849ede2a862ce15501938148ae7cce8f Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Fri, 3 Mar 2023 19:14:30 -0500 Subject: [PATCH 078/174] Rearrange the local part syntax checks to put the more likely success conditions first --- email_validator/rfc_constants.py | 2 ++ email_validator/syntax.py | 58 +++++++++++++++++--------------- 2 files changed, 32 insertions(+), 28 deletions(-) diff --git a/email_validator/rfc_constants.py b/email_validator/rfc_constants.py index bf21a9c..802f773 100644 --- a/email_validator/rfc_constants.py +++ b/email_validator/rfc_constants.py @@ -6,6 +6,7 @@ # characters are permitted in email addresses (not taking into # account internationalization): ATEXT = r'a-zA-Z0-9_!#\$%&\'\*\+\-/=\?\^`\{\|\}~' +ATEXT_RE = re.compile('[.' + ATEXT + ']') # ATEXT plus dots # A "dot atom text", per RFC 2822 3.2.4: DOT_ATOM_TEXT = re.compile('[' + ATEXT + ']+(?:\\.[' + ATEXT + r']+)*\Z') @@ -15,6 +16,7 @@ # RFC3629 section 4, which appear to be the Unicode code points from # U+0080 to U+10FFFF. ATEXT_INTL = ATEXT + u"\u0080-\U0010FFFF" +ATEXT_INTL_RE = re.compile('[.' 
+ ATEXT_INTL + ']') # ATEXT_INTL plus dots DOT_ATOM_TEXT_INTL = re.compile('[' + ATEXT_INTL + ']+(?:\\.[' + ATEXT_INTL + r']+)*\Z') # The domain part of the email address, after IDNA (ASCII) encoding, diff --git a/email_validator/syntax.py b/email_validator/syntax.py index cf7c304..c545665 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -1,6 +1,6 @@ from .exceptions_types import EmailSyntaxError from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \ - DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT, ATEXT_INTL, ATEXT_HOSTNAME_INTL, DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX + DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_RE, ATEXT_HOSTNAME_INTL, DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX import re import unicodedata @@ -57,26 +57,13 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals reason = get_length_reason(local, limit=LOCAL_PART_MAX_LENGTH) raise EmailSyntaxError("The email address is too long before the @-sign {}.".format(reason)) - # Check for invalid characters. - # (RFC 2822 Section 3.2.4 / RFC 5322 Section 3.2.3, plus RFC 6531 section 3.3 - # if internationalized local parts are allowed) - atext_re = re.compile('[.' + (ATEXT if not allow_smtputf8 else ATEXT_INTL) + ']') - bad_chars = set( - safe_character_display(c) - for c in local - if not atext_re.match(c) - ) - if bad_chars: - raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".") - - # Check for dot errors imposted by the dot-atom rule. - # (RFC 2822 3.2.4) - check_dot_atom(local, 'An email address cannot start with a {}.', 'An email address cannot have a {} immediately before the @-sign.', is_hostname=False) - # Check the local part against the non-internationalized regular expression. + # Most email addresses match this regex so it's probably fastest to check this first. 
# (RFC 2822 3.2.4) m = DOT_ATOM_TEXT.match(local) if m: + # It's valid. + # Return the local part unchanged and flag that SMTPUTF8 is not needed. return { "local_part": local, @@ -84,17 +71,11 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals "smtputf8": False, } - else: - # The local part failed the ASCII check. Now try the extended internationalized requirements. - # This should already be handled by the bad_chars and check_dot_atom tests above. - # It's the same pattern but with additional characters permitted. - m = DOT_ATOM_TEXT_INTL.match(local) - if not m: - raise EmailSyntaxError("The email address contains invalid characters before the @-sign.") - # It would be valid if internationalized characters were allowed by the caller. - if not allow_smtputf8: - raise EmailSyntaxError("Internationalized characters before the @-sign are not supported.") - + # The local part failed the ASCII check. Try the extended character set + # for internationalized addresses. It's the same pattern but with additional + # characters permitted. + m = DOT_ATOM_TEXT_INTL.match(local) + if m and allow_smtputf8: # It's valid. # RFC 6532 section 3.1 also says that Unicode NFC normalization should be applied, @@ -122,6 +103,27 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals "smtputf8": True, } + # It's not a valid local part either non-internationalized or internationalized. + # Let's find out why. + + # Check for invalid characters. + # (RFC 2822 Section 3.2.4 / RFC 5322 Section 3.2.3, plus RFC 6531 section 3.3) + bad_chars = set( + safe_character_display(c) + for c in local + if not (ATEXT_INTL_RE if allow_smtputf8 else ATEXT_RE).match(c) + ) + if bad_chars: + raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".") + + # Check for dot errors imposted by the dot-atom rule. 
+ # (RFC 2822 3.2.4) + check_dot_atom(local, 'An email address cannot start with a {}.', 'An email address cannot have a {} immediately before the @-sign.', is_hostname=False) + + # All of the reasons should already have been checked, but just in case + # we have a fallback message. + raise EmailSyntaxError("The email address contains invalid characters before the @-sign.") + def check_unsafe_chars(s): # Check for unsafe characters or characters that would make the string From 3804cd701076bf5601fdb5cde7adbbac4ddf8561 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Fri, 3 Mar 2023 19:14:56 -0500 Subject: [PATCH 079/174] Explicitly check for non-ASCII characters with better error messages when SMTPUTF8 is off --- email_validator/syntax.py | 20 ++++++++++-- tests/test_syntax.py | 65 +++++++++++++++++++++++++++++---------- 2 files changed, 66 insertions(+), 19 deletions(-) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index c545665..a6d31ce 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -75,9 +75,25 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals # for internationalized addresses. It's the same pattern but with additional # characters permitted. m = DOT_ATOM_TEXT_INTL.match(local) - if m and allow_smtputf8: + if m: # It's valid. + # But international characters in the local part may not be permitted. + if not allow_smtputf8: + # Check for invalid characters against the non-internationalized + # permitted character set. + # (RFC 2822 Section 3.2.4 / RFC 5322 Section 3.2.3) + bad_chars = set( + safe_character_display(c) + for c in local + if not ATEXT_RE.match(c) + ) + if bad_chars: + raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(sorted(bad_chars)) + ".") + + # Although the check above should always find something, fall back to this just in case. 
+ raise EmailSyntaxError("Internationalized characters before the @-sign are not supported.") + # RFC 6532 section 3.1 also says that Unicode NFC normalization should be applied, # so we'll return the normalized local part in the return value. local = unicodedata.normalize("NFC", local) @@ -111,7 +127,7 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals bad_chars = set( safe_character_display(c) for c in local - if not (ATEXT_INTL_RE if allow_smtputf8 else ATEXT_RE).match(c) + if not ATEXT_INTL_RE.match(c) ) if bad_chars: raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".") diff --git a/tests/test_syntax.py b/tests/test_syntax.py index f60bf50..a07e4ac 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -56,6 +56,31 @@ ascii_email="!#$%&'*+-/=?^_`.{|}~@example.tld", ), ), + ( + 'jeff@臺網中心.tw', + ValidatedEmail( + local_part='jeff', + ascii_local_part='jeff', + smtputf8=False, + ascii_domain='xn--fiqq24b10vi0d.tw', + domain='臺網中心.tw', + email='jeff@臺網中心.tw', + ascii_email='jeff@xn--fiqq24b10vi0d.tw', + ), + ), + ], +) +def test_email_valid(email_input, output): + # These addresses do not require SMTPUTF8. See test_email_valid_intl_local_part + # for addresses that are valid but require SMTPUTF8. Check that it passes with + # allow_smtput8 both on and off. 
+ assert validate_email(email_input, check_deliverability=False, allow_smtputf8=False) == output + assert validate_email(email_input, check_deliverability=False, allow_smtputf8=True) == output + + +@pytest.mark.parametrize( + 'email_input,output', + [ ( '伊昭傑@郵件.商務', ValidatedEmail( @@ -106,18 +131,6 @@ email='葉士豪@臺網中心.tw', ), ), - ( - 'jeff@臺網中心.tw', - ValidatedEmail( - local_part='jeff', - ascii_local_part='jeff', - smtputf8=False, - ascii_domain='xn--fiqq24b10vi0d.tw', - domain='臺網中心.tw', - email='jeff@臺網中心.tw', - ascii_email='jeff@xn--fiqq24b10vi0d.tw', - ), - ), ( '葉士豪@臺網中心.台灣', ValidatedEmail( @@ -200,10 +213,15 @@ ), ], ) -def test_email_valid(email_input, output): - # print(f'({email_input!r}, {validate_email(email_input, check_deliverability=False)!r}),') +def test_email_valid_intl_local_part(email_input, output): + # Check that it passes when allow_smtputf8 is True. assert validate_email(email_input, check_deliverability=False) == output + # Check that it fails when allow_smtputf8 is False. + with pytest.raises(EmailSyntaxError) as exc_info: + validate_email(email_input, allow_smtputf8=False, check_deliverability=False) + assert "Internationalized characters before the @-sign are not supported: " in str(exc_info.value) + @pytest.mark.parametrize( 'email_input,error_msg', @@ -263,7 +281,6 @@ def test_email_invalid_syntax(email_input, error_msg): # checks do not arise. with pytest.raises(EmailSyntaxError) as exc_info: validate_email(email_input) - # print(f'({email_input!r}, {str(exc_info.value)!r}),') assert str(exc_info.value) == error_msg @@ -283,7 +300,6 @@ def test_email_invalid_reserved_domain(email_input): # DNS deliverability checks do not arise. 
with pytest.raises(EmailSyntaxError) as exc_info: validate_email(email_input) - # print(f'({email_input!r}, {str(exc_info.value)!r}),') assert "is a special-use or reserved name" in str(exc_info.value) @@ -317,16 +333,31 @@ def test_email_unsafe_character(s, expected_error): ('email_input', 'expected_error'), [ ('white space@test', 'The email address contains invalid characters before the @-sign: SPACE.'), + ('test@white space', 'The part after the @-sign contains invalid characters: SPACE.'), ('\n@test', 'The email address contains invalid characters before the @-sign: U+000A.'), + ('test@\n', 'The part after the @-sign contains invalid characters: U+000A.'), ], ) def test_email_invalid_character(email_input, expected_error): - # Check for various unsafe characters: + # Check for various unsafe test_email_invalid_character_smtputf8: with pytest.raises(EmailSyntaxError) as exc_info: validate_email(email_input, test_environment=True) assert str(exc_info.value) == expected_error +@pytest.mark.parametrize( + ('email_input', 'expected_error'), + [ + ('λambdaツ@test', 'Internationalized characters before the @-sign are not supported: \'λ\', \'ツ\'.'), + ], +) +def test_email_invalid_character_smtputf8(email_input, expected_error): + # Check for various unsafe characters: + with pytest.raises(EmailSyntaxError) as exc_info: + validate_email(email_input, allow_smtputf8=False, test_environment=True) + assert str(exc_info.value) == expected_error + + def test_email_test_domain_name_in_test_environment(): validate_email("anything@test", test_environment=True) validate_email("anything@mycompany.test", test_environment=True) From d4c9ec2a16052a1298ba68f6b81f1081cb55f6e0 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 4 Mar 2023 09:43:39 -0500 Subject: [PATCH 080/174] Some README and code comments changes --- README.md | 80 +++++++++++++++---------------------- email_validator/__main__.py | 10 +++-- 2 files changed, 38 insertions(+), 52 deletions(-) diff --git 
a/README.md b/README.md index 10ce952..9fc80c9 100644 --- a/README.md +++ b/README.md @@ -4,28 +4,31 @@ email-validator: Validate Email Addresses A robust email address syntax and deliverability validation library for Python 3.7+ by [Joshua Tauberer](https://joshdata.me). -This library validates that a string is of the form `name@example.com` and optionally checks that the domain name is set up to receive email. This is -the sort of validation you would want for an email-based registration form on -a website (but not necessarily for composing an email message). +This library validates that a string is of the form `name@example.com` +and optionally checks that the domain name is set up to receive email. +This is the sort of validation you would want when you are identifying +users by their email address like on a registration/login form (but not +necessarily for composing an email message, see below). Key features: * Checks that an email address has the correct syntax --- good for registration/login forms or other uses related to identifying users. -* Gives friendly error messages when validation fails (appropriate to show - to end users). -* (optionally) Checks deliverability: Does the domain name resolve? You can override the default DNS resolver. -* Supports internationalized domain names and (optionally) - internationalized local parts, but blocks unsafe characters. -* Normalizes email addresses (super important for internationalized + Rejects obsolete email address syntax that you'd find unexpected. +* Gives friendly English error messages when validation fails that you + can display to end-users. +* Checks deliverability (optional): Does the domain name resolve? + (You can override the default DNS resolver to add query caching.) +* Supports internationalized domain names and internationalized local parts. + Blocks unsafe characters for your safety. +* Normalizes email addresses (important for internationalized addresses! see below). 
-The library is NOT for validation of the To: line in an email message -(e.g. `My Name `), which -[flanker](https://github.com/mailgun/flanker) is more appropriate for. -And this library does NOT permit obsolete forms of email addresses, so +This library does NOT permit obsolete forms of email addresses, so if you need strict validation against the email specs exactly, use -[pyIsEmail](https://github.com/michaelherold/pyIsEmail). +[pyIsEmail](https://github.com/michaelherold/pyIsEmail) or try +[flanker](https://github.com/mailgun/flanker) if you are parsing the +To: line of an email. [![Build Status](https://github.com/JoshData/python-email-validator/actions/workflows/test_and_build.yaml/badge.svg)](https://github.com/JoshData/python-email-validator/actions/workflows/test_and_build.yaml) @@ -42,7 +45,7 @@ This package [is on PyPI](https://pypi.org/project/email-validator/), so: pip install email-validator ``` -`pip3` also works. +(You might need to use `pip3` depending on your local environment.) Quick Start ----------- @@ -82,8 +85,7 @@ Usage ### Overview The module provides a function `validate_email(email_address)` which -takes an email address (either a `str` or `bytes`, but only non-internationalized -addresses are allowed when passing a `bytes`) and: +takes an email address and: - Raises a `EmailNotValidError` with a helpful, human-readable error message explaining why the email address is not valid, or @@ -121,18 +123,19 @@ greylisting). The `validate_email` function also accepts the following keyword arguments (defaults are as shown below): +`check_deliverability=True`: If true, a DNS query is made to check that a non-null MX record is present for the domain-part of the email address (or if not, an A/AAAA record as an MX fallback can be present but in that case a reject-all SPF record must not be present). Set to `False` to skip this DNS-based check. DNS is slow and sometimes unavailable, so consider whether these checks are useful for your use case. 
It is recommended to pass `False` when performing validation for login pages (but not account creation pages) since re-validation of a previously validated domain in your database by querying DNS at every login is probably undesirable. You can also set `email_validator.CHECK_DELIVERABILITY` to `False` to turn this off for all calls by default. + +`dns_resolver=None`: Pass an instance of [dns.resolver.Resolver](https://dnspython.readthedocs.io/en/latest/resolver-class.html) to control the DNS resolver including setting a timeout and [a cache](https://dnspython.readthedocs.io/en/latest/resolver-caching.html). The `caching_resolver` function shown above is a helper function to construct a dns.resolver.Resolver with a [LRUCache](https://dnspython.readthedocs.io/en/latest/resolver-caching.html#dns.resolver.LRUCache). Reuse the same resolver instance across calls to `validate_email` to make use of the cache. + +`test_environment=False`: DNS-based deliverability checks are disabled and `test` and `subdomain.test` domain names are permitted (see below). You can also set `email_validator.TEST_ENVIRONMENT` to `True` to turn it on for all calls by default. + `allow_smtputf8=True`: Set to `False` to prohibit internationalized addresses that would require the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) extension. You can also set `email_validator.ALLOW_SMTPUTF8` to `False` to turn it off for all calls by default. -`check_deliverability=True`: If true, a DNS query is made to check that a non-null MX record is present for the domain-part of the email address (or if not, an A/AAAA record as an MX fallback can be present but in that case a reject-all SPF record must not be present). Set to `False` to skip this DNS-based check. DNS is slow and sometimes unavailable, so consider whether these checks are useful for your use case. 
It is recommended to pass `False` when performing validation for login pages (but not account creation pages) since re-validation of a previously validated domain in your database by querying DNS at every login is probably undesirable. You can also set `email_validator.CHECK_DELIVERABILITY` to `False` to turn this off for all calls by default. - `allow_empty_local=False`: Set to `True` to allow an empty local part (i.e. `@example.com`), e.g. for validating Postfix aliases. -`dns_resolver=None`: Pass an instance of [dns.resolver.Resolver](https://dnspython.readthedocs.io/en/latest/resolver-class.html) to control the DNS resolver including setting a timeout and [a cache](https://dnspython.readthedocs.io/en/latest/resolver-caching.html). The `caching_resolver` function shown above is a helper function to construct a dns.resolver.Resolver with a [LRUCache](https://dnspython.readthedocs.io/en/latest/resolver-caching.html#dns.resolver.LRUCache). Reuse the same resolver instance across calls to `validate_email` to make use of the cache. - -`test_environment=False`: DNS-based deliverability checks are disabled and `test` and `subdomain.test` domain names are permitted (see below). You can also set `email_validator.TEST_ENVIRONMENT` to `True` to turn it on for all calls by default. ### DNS timeout and cache @@ -180,13 +183,8 @@ domain names are converted into a special IDNA ASCII "[Punycode](https://www.rfc form starting with `xn--`. When an email address has non-ASCII characters in its domain part, the domain part is replaced with its IDNA ASCII equivalent form in the process of mail transmission. Your mail -submission library probably does this for you transparently. 
Note that -most web browsers are currently in transition between IDNA 2003 (RFC -3490) and IDNA 2008 (RFC 5891) and [compliance around the web is not -very -good](http://archives.miloush.net/michkap/archive/2012/02/27/10273315.html) -in any case, so be aware that edge cases are handled differently by -different applications and libraries. This library conforms to IDNA 2008 +submission library probably does this for you transparently. ([Compliance +around the web is not very good though](http://archives.miloush.net/michkap/archive/2012/02/27/10273315.html).) This library conforms to IDNA 2008 using the [idna](https://github.com/kjd/idna) module by Kim Davies. ### Internationalized local parts @@ -305,7 +303,7 @@ ValidatedEmail( smtputf8=False) ``` -For the fictitious address `example@ツ.life`, which has an +For the fictitious but valid address `example@ツ.ⓁⒾⒻⒺ`, which has an internationalized domain but ASCII local part, the returned object is: ```python @@ -320,13 +318,9 @@ ValidatedEmail( ``` -Note that `smtputf8` is `False` even though the domain part is -internationalized because -[SMTPUTF8](https://tools.ietf.org/html/rfc6531) is only needed if the -local part of the address is internationalized (the domain part can be -converted to IDNA ASCII Punycode). Also note that the `email` and `domain` -fields provide a normalized form of the email address and domain name -(casefolding and Unicode normalization as required by IDNA 2008). +Note that the `email` and `domain` fields provide a normalized form of the +email address, domain name, and (in other cases) local part (see earlier +discussion of normalization), which you should use in your database. Calling `validate_email` with the ASCII form of the above email address, `example@xn--bdk.life`, returns the exact same information (i.e., the @@ -390,21 +384,11 @@ or likely to cause trouble: The "quoted string" form of the local part of the email address (RFC 5321 4.1.2) is not permitted. 
Quoted forms allow multiple @-signs, space characters, and other - troublesome conditions. The unsual [(comment) syntax](https://github.com/JoshData/python-email-validator/issues/77) + troublesome conditions. The unusual [(comment) syntax](https://github.com/JoshData/python-email-validator/issues/77) is also rejected. The "literal" form for the domain part of an email address (an IP address in brackets) is rejected. Other obsolete and deprecated syntaxes are rejected. No one uses these forms anymore. -Support for Python 2.x ----------------------- - -The last version of this library supporting Python 2.x is version 1.2.1. - -When using Python 2.x, it is required that Python be built with -UCS-4 support (see -[here](https://stackoverflow.com/questions/29109944/python-returns-length-of-2-for-single-unicode-character-string)). -Without UCS-4 support, unicode characters outside of the BMP (Basic -Multilingual Plane) will not validate correctly in internationalized addresses. Testing ------- diff --git a/email_validator/__main__.py b/email_validator/__main__.py index 4684f6a..a788ead 100644 --- a/email_validator/__main__.py +++ b/email_validator/__main__.py @@ -2,12 +2,14 @@ # # Usage: # -# python -m email_validator +# python -m email_validator test@example.org +# python -m email_validator < LIST_OF_ADDRESSES.TXT # # Provide email addresses to validate either as a command-line argument -# or in STDIN separated by newlines. No output will be given for valid -# email addresses. Validation errors will be printed for invalid email -# addresses. +# or in STDIN separated by newlines. Validation errors will be printed for +# invalid email addresses. When passing an email address on the command +# line, if the email address is valid, information about it will be printed. +# When using STDIN, no output will be given for valid email addresses. 
import json import sys From 1e3eeeacbb568c22cb6eb18b1a1ffcd731895089 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 4 Mar 2023 11:56:12 -0500 Subject: [PATCH 081/174] Use f-strings instead of .format or '%' which are new in Python 3.6 --- email_validator/__main__.py | 2 +- email_validator/deliverability.py | 8 ++++---- email_validator/exceptions_types.py | 2 +- email_validator/syntax.py | 19 +++++++++---------- email_validator/validate_email.py | 4 ++-- tests/mocked_dns_response.py | 2 +- tests/test_deliverability.py | 6 +++--- tests/test_syntax.py | 2 +- 8 files changed, 22 insertions(+), 23 deletions(-) diff --git a/email_validator/__main__.py b/email_validator/__main__.py index a788ead..9330553 100644 --- a/email_validator/__main__.py +++ b/email_validator/__main__.py @@ -30,7 +30,7 @@ def main(dns_resolver=None): try: validate_email(email, dns_resolver=dns_resolver) except EmailNotValidError as e: - print("{} {}".format(email, e)) + print(f"{email} {e}") else: # Validate the email address passed on the command line. email = sys.argv[1] diff --git a/email_validator/deliverability.py b/email_validator/deliverability.py index 19eee65..c454e8c 100644 --- a/email_validator/deliverability.py +++ b/email_validator/deliverability.py @@ -47,7 +47,7 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve mtas = [(preference, exchange) for preference, exchange in mtas if exchange != ""] if len(mtas) == 0: # null MX only, if there were no MX records originally a NoAnswer exception would have occurred - raise EmailUndeliverableError("The domain name %s does not accept email." 
% domain_i18n) + raise EmailUndeliverableError(f"The domain name {domain_i18n} does not accept email.") deliverability_info["mx"] = mtas deliverability_info["mx_fallback_type"] = None @@ -73,7 +73,7 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve # this domain is not deliverable, although the domain # name has other records (otherwise NXDOMAIN would # have been raised). - raise EmailUndeliverableError("The domain name %s does not accept email." % domain_i18n) + raise EmailUndeliverableError(f"The domain name {domain_i18n} does not accept email.") # Check for a SPF (RFC 7208) reject-all record ("v=spf1 -all") which indicates # no emails are sent from this domain (similar to a Null MX record @@ -87,7 +87,7 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve if value.startswith(b"v=spf1 "): deliverability_info["spf"] = value.decode("ascii", errors='replace') if value == b"v=spf1 -all": - raise EmailUndeliverableError("The domain name %s does not send email." % domain_i18n) + raise EmailUndeliverableError(f"The domain name {domain_i18n} does not send email.") except dns.resolver.NoAnswer: # No TXT records means there is no SPF policy, so we cannot take any action. pass @@ -95,7 +95,7 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve except dns.resolver.NXDOMAIN: # The domain name does not exist --- there are no records of any sort # for the domain name. - raise EmailUndeliverableError("The domain name %s does not exist." % domain_i18n) + raise EmailUndeliverableError(f"The domain name {domain_i18n} does not exist.") except dns.resolver.NoNameservers: # All nameservers failed to answer the query. 
This might be a problem diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py index 4fb913d..00fd856 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/exceptions_types.py @@ -65,7 +65,7 @@ def __self__(self): return self.normalized_email def __repr__(self): - return "<ValidatedEmail {}>".format(self.email) + return f"<ValidatedEmail {self.email}>" """For backwards compatibility, some fields are also exposed through a dict-like interface. Note that some of the names changed when they became attributes.""" diff --git a/email_validator/syntax.py b/email_validator/syntax.py index a6d31ce..998ce59 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -10,10 +10,9 @@ def get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH): """Helper function to return an error message related to invalid length.""" diff = len(addr) - limit - reason = "({}{} character{} too many)" prefix = "at least " if utf8 else "" suffix = "s" if diff > 1 else "" - return reason.format(prefix, diff, suffix) + return f"({prefix}{diff} character{suffix} too many)" def safe_character_display(c): @@ -23,9 +22,9 @@ def safe_character_display(c): # Construct a hex string in case the unicode name doesn't exist. if ord(c) < 0xFFFF: - h = "U+{:04x}".format(ord(c)).upper() + h = f"U+{ord(c):04x}".upper() else: - h = "U+{:08x}".format(ord(c)).upper() + h = f"U+{ord(c):08x}".upper() # Return the character name or, if it has no name, the hex string. return unicodedata.name(c, h) @@ -55,7 +54,7 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals # instead. if len(local) > LOCAL_PART_MAX_LENGTH: reason = get_length_reason(local, limit=LOCAL_PART_MAX_LENGTH) - raise EmailSyntaxError("The email address is too long before the @-sign {}.".format(reason)) + raise EmailSyntaxError(f"The email address is too long before the @-sign {reason}.") # Check the local part against the non-internationalized regular expression.
# Most email addresses match this regex so it's probably fastest to check this first. @@ -231,7 +230,7 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera try: domain = idna.uts46_remap(domain, std3_rules=False, transitional=False) except idna.IDNAError as e: - raise EmailSyntaxError("The part after the @-sign contains invalid characters ({}).".format(str(e))) + raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") # The domain part is made up period-separated "labels." Each label must # have at least one character and cannot start or end with dashes, which @@ -274,7 +273,7 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera # one the user supplied. Also I'm not sure if the length check applies # to the internationalized form, the IDNA ASCII form, or even both! raise EmailSyntaxError("The email address is too long after the @-sign.") - raise EmailSyntaxError("The part after the @-sign contains invalid characters (%s)." % str(e)) + raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") # Check the syntax of the string returned by idna.encode. # It should never fail. @@ -291,14 +290,14 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera # is never reached for internationalized domains.) if len(ascii_domain) > DOMAIN_MAX_LENGTH: reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH) - raise EmailSyntaxError("The email address is too long after the @-sign {}.".format(reason)) + raise EmailSyntaxError(f"The email address is too long after the @-sign {reason}.") # Also check the label length limit. 
# (RFC 1035 2.3.1) for label in ascii_domain.split("."): if len(label) > DNS_LABEL_LENGTH_LIMIT: reason = get_length_reason(label, limit=DNS_LABEL_LENGTH_LIMIT) - raise EmailSyntaxError("After the @-sign, periods cannot be separated by so many characters {}.".format(reason)) + raise EmailSyntaxError(f"After the @-sign, periods cannot be separated by so many characters {reason}.") if globally_deliverable: # All publicly deliverable addresses have domain named with at least @@ -337,7 +336,7 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera try: domain_i18n = idna.decode(ascii_domain.encode('ascii')) except idna.IDNAError as e: - raise EmailSyntaxError("The part after the @-sign is not valid IDNA ({}).".format(str(e))) + raise EmailSyntaxError(f"The part after the @-sign is not valid IDNA ({e}).") # Check for invalid characters after normalization. These # should never arise. See the similar checks above. diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index 73e5ee7..f7efce8 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -101,7 +101,7 @@ def validate_email( reason = get_length_reason(ret.email, utf8=True) else: reason = "(when converted to IDNA ASCII)" - raise EmailSyntaxError("The email address is too long {}.".format(reason)) + raise EmailSyntaxError(f"The email address is too long {reason}.") if len(ret.email.encode("utf8")) > EMAIL_MAX_LENGTH: if len(ret.email) > EMAIL_MAX_LENGTH: # If there are more than 254 characters, then the UTF-8 @@ -109,7 +109,7 @@ def validate_email( reason = get_length_reason(ret.email, utf8=True) else: reason = "(when encoded in bytes)" - raise EmailSyntaxError("The email address is too long {}.".format(reason)) + raise EmailSyntaxError(f"The email address is too long {reason}.") if check_deliverability and not test_environment: # Validate the email address's deliverability using DNS diff --git a/tests/mocked_dns_response.py 
b/tests/mocked_dns_response.py index 124f208..cd32796 100644 --- a/tests/mocked_dns_response.py +++ b/tests/mocked_dns_response.py @@ -103,7 +103,7 @@ def get(self, key): if (key[0], ANY, key[2]) in self.data and self.data[(key[0], ANY, key[2])] is None: raise dns.resolver.NXDOMAIN() - raise ValueError("Saved DNS data did not contain query: {}".format(key)) + raise ValueError(f"Saved DNS data did not contain query: {key}") def put(self, key, value): # Build the DNS data by saving the live query response. diff --git a/tests/test_deliverability.py b/tests/test_deliverability.py index 46ec3d8..7431668 100644 --- a/tests/test_deliverability.py +++ b/tests/test_deliverability.py @@ -23,17 +23,17 @@ def test_deliverability_found(): def test_deliverability_fails(): # No MX record. domain = 'xkxufoekjvjfjeodlfmdfjcu.com' - with pytest.raises(EmailUndeliverableError, match='The domain name {} does not exist'.format(domain)): + with pytest.raises(EmailUndeliverableError, match=f'The domain name {domain} does not exist'): validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) # Null MX record. domain = 'example.com' - with pytest.raises(EmailUndeliverableError, match='The domain name {} does not accept email'.format(domain)): + with pytest.raises(EmailUndeliverableError, match=f'The domain name {domain} does not accept email'): validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) # No MX record, A record fallback, reject-all SPF record. 
domain = 'nellis.af.mil' - with pytest.raises(EmailUndeliverableError, match='The domain name {} does not send email'.format(domain)): + with pytest.raises(EmailUndeliverableError, match=f'The domain name {domain} does not send email'): validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) diff --git a/tests/test_syntax.py b/tests/test_syntax.py index a07e4ac..fcaa7ed 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -559,4 +559,4 @@ def test_pyisemail_tests(email_input, status): with pytest.raises(EmailSyntaxError): validate_email(email_input, test_environment=True) else: - raise ValueError("status {} is not recognized".format(status)) + raise ValueError(f"status {status} is not recognized") From d390c6b9a19853a5dc355e48bc43d7c4bc165d9d Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 4 Mar 2023 09:51:00 -0500 Subject: [PATCH 082/174] 2.0.0-pre1 release --- CHANGELOG.md | 12 ++++++++---- setup.cfg | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d4366cd..cbccd81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,12 @@ -In Development --------------- +2.0.0-dev1 +---------- -* Python versions through 3.6 and dnspython 1.x are no longer supported. Python 3.7+ with dnspython 2.x are now required. -* The dnspython package is no longer required if DNS checks are not used. +This is a pre-release for version 2.0.0. + +There are no significant changes to which email addresses are considered valid/invalid, but there are many changes in error messages and internal improvements to the library, and Python 3.7+ is now required. + +* Python 2.x and 3.x versions through 3.6, and dnspython 1.x, are no longer supported. Python 3.7+ with dnspython 2.x are now required. +* The dnspython package is no longer required if DNS checks are not used, although it will install automatically. 
* NoNameservers and NXDOMAIN DNS errors are now handled differently: NoNameservers no longer fails validation, and NXDOMAIN now skips checking for an A/AAAA fallback and goes straight to failing validation. * Some syntax error messages have changed because they are now checked explicitly rather than as a part of other checks. * Some other error messages have changed to not repeat the email address in the error message. diff --git a/setup.cfg b/setup.cfg index 6b62044..73dcb32 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = email_validator -version = 1.3.1 +version = 2.0.0-dev1 description = A robust email address syntax and deliverability validation library. long_description = file: README.md long_description_content_type = text/markdown From 7f838b823ecc810f2a980a6500f918295c9544f3 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 4 Mar 2023 13:37:40 -0500 Subject: [PATCH 083/174] Remove inoperable code ValidatedEmail.__self__ added in 8243bd23 This was probably a typo for __str__ and it accessed a since-removed field. --- email_validator/exceptions_types.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py index 00fd856..366e67e 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/exceptions_types.py @@ -60,10 +60,6 @@ def __init__(self, **kwargs): for k, v in kwargs.items(): setattr(self, k, v) - """As a convenience, str(...) on instances of this class return the normalized address.""" - def __self__(self): - return self.normalized_email - def __repr__(self): return f"<ValidatedEmail {self.email}>" From 44aa552218b068b489c0a33a783942555092eccd Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 13 Mar 2023 18:01:23 -0400 Subject: [PATCH 084/174] Add type annotations (#99) This is my first time using type hints so hopefully it's right. * Added type annotations to the exported methods, some of the main internal methods, and the ValidatedEmail class.
* ValidatedEmail's ascii_email, ascii_local_part, ascii_domain, mx, and mx_fallback_type attributes now no longer are set by the class's __init__ method, although they are always filled in by validate_email. * Make the main module's exports explicit to solve implicit re-export lint warning/error. * Added py.typed. * Added `mypy` to tests. * Made a -dev4 release to test. Closes #98. --- .github/workflows/test_and_build.yaml | 3 +++ .travis.yml | 1 + CHANGELOG.md | 5 ++-- Makefile | 6 ++++- email_validator/__init__.py | 11 ++++++-- email_validator/deliverability.py | 12 +++++---- email_validator/exceptions_types.py | 37 +++++++++++++++------------ email_validator/syntax.py | 2 +- email_validator/validate_email.py | 26 +++++++++++-------- setup.cfg | 5 +++- test_requirements.txt | 5 +++- 11 files changed, 73 insertions(+), 40 deletions(-) diff --git a/.github/workflows/test_and_build.yaml b/.github/workflows/test_and_build.yaml index 40314b5..e80acd6 100644 --- a/.github/workflows/test_and_build.yaml +++ b/.github/workflows/test_and_build.yaml @@ -23,6 +23,9 @@ jobs: - name: Lint with flake8 run: | make lint + - name: Check typing with mypy + run: | + make typing - name: Test with pytest run: | make test diff --git a/.travis.yml b/.travis.yml index ec189a2..3a283dc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,6 +15,7 @@ install: - make install script: +- make typing - make lint - make test diff --git a/CHANGELOG.md b/CHANGELOG.md index cbccd81..bcd013e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,9 @@ -2.0.0-dev1 +2.0.0-dev4 ---------- This is a pre-release for version 2.0.0. -There are no significant changes to which email addresses are considered valid/invalid, but there are many changes in error messages and internal improvements to the library, and Python 3.7+ is now required. 
+There are no significant changes to which email addresses are considered valid/invalid, but there are many changes in error messages and internal improvements to the library including the addition of type annotations, and Python 3.7+ is now required. * Python 2.x and 3.x versions through 3.6, and dnspython 1.x, are no longer supported. Python 3.7+ with dnspython 2.x are now required. * The dnspython package is no longer required if DNS checks are not used, although it will install automatically. @@ -12,6 +12,7 @@ There are no significant changes to which email addresses are considered valid/i * Some other error messages have changed to not repeat the email address in the error message. * The library has been reorganized internally into smaller modules. * The tests have been reorganized and expanded. Deliverability tests now mostly use captured DNS responses so they can be run off-line. +* Type annotations have been added to the exported methods and the ValidatedEmail class and some internal methods. Version 1.3.1 (January 21, 2023) -------------------------------- diff --git a/Makefile b/Makefile index 9226591..7233a19 100644 --- a/Makefile +++ b/Makefile @@ -11,6 +11,10 @@ lint: #python setup.py check -rms flake8 --ignore=E501,E126,W503 email_validator tests +.PHONY: typing +typing: + mypy email_validator/*.py tests/*.py + .PHONY: test test: PYTHONPATH=.:$PYTHONPATH pytest --cov=email_validator @@ -21,7 +25,7 @@ testcov: test @coverage html .PHONY: all -all: testcov lint +all: typing testcov lint .PHONY: clean clean: diff --git a/email_validator/__init__.py b/email_validator/__init__.py index 94ebcb6..9d5373e 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -1,8 +1,15 @@ # -*- coding: utf-8 -*- # Export the main method, helper methods, and the public data types. 
-from .exceptions_types import * # noqa: F401,F403 -from .validate_email import validate_email # noqa: F401 +from .exceptions_types import ValidatedEmail, EmailNotValidError, \ + EmailSyntaxError, EmailUndeliverableError +from .validate_email import validate_email + + +__all__ = ["validate_email", + "ValidatedEmail", "EmailNotValidError", + "EmailSyntaxError", "EmailUndeliverableError", + "caching_resolver"] def caching_resolver(*args, **kwargs): diff --git a/email_validator/deliverability.py b/email_validator/deliverability.py index c454e8c..f787bb3 100644 --- a/email_validator/deliverability.py +++ b/email_validator/deliverability.py @@ -1,20 +1,22 @@ +from typing import Optional, Any, Dict + from .exceptions_types import EmailUndeliverableError import dns.resolver import dns.exception -def caching_resolver(*, timeout=None, cache=None): +def caching_resolver(*, timeout: Optional[int] = None, cache=None): if timeout is None: from . import DEFAULT_TIMEOUT timeout = DEFAULT_TIMEOUT resolver = dns.resolver.Resolver() - resolver.cache = cache or dns.resolver.LRUCache() - resolver.lifetime = timeout # timeout, in seconds + resolver.cache = cache or dns.resolver.LRUCache() # type: ignore + resolver.lifetime = timeout # type: ignore # timeout, in seconds return resolver -def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolver=None): +def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Optional[int] = None, dns_resolver=None): # Check that the domain resolves to an MX record. If there is no MX record, # try an A or AAAA record which is a deprecated fallback for deliverability. # Raises an EmailUndeliverableError on failure. 
On success, returns a dict @@ -30,7 +32,7 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve dns_resolver = dns.resolver.get_default_resolver() dns_resolver.lifetime = timeout - deliverability_info = {} + deliverability_info: Dict[str, Any] = {} try: try: diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py index 366e67e..176d50e 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/exceptions_types.py @@ -1,3 +1,6 @@ +from typing import Optional + + class EmailNotValidError(ValueError): """Parent class of all exceptions raised by this module.""" pass @@ -18,42 +21,42 @@ class ValidatedEmail(object): and other information.""" """The email address that was passed to validate_email. (If passed as bytes, this will be a string.)""" - original_email = None + original_email: str """The normalized email address, which should always be used in preference to the original address. The normalized address converts an IDNA ASCII domain name to Unicode, if possible, and performs Unicode normalization on the local part and on the domain (if originally Unicode).
It is the concatenation of the local_part and domain attributes, separated by an @-sign.""" - email = None + email: str """The local part of the email address after Unicode normalization.""" - local_part = None + local_part: str """The domain part of the email address after Unicode normalization or conversion to Unicode from IDNA ascii.""" - domain = None + domain: str """If not None, a form of the email address that uses 7-bit ASCII characters only.""" - ascii_email = None + ascii_email: Optional[str] """If not None, the local part of the email address using 7-bit ASCII characters only.""" - ascii_local_part = None + ascii_local_part: Optional[str] - """If not None, a form of the domain name that uses 7-bit ASCII characters only.""" - ascii_domain = None + """A form of the domain name that uses 7-bit ASCII characters only.""" + ascii_domain: str """If True, the SMTPUTF8 feature of your mail relay will be required to transmit messages to this address. This flag is True just when ascii_local_part is missing. 
Otherwise it is False.""" - smtputf8 = None + smtputf8: bool """If a deliverability check is performed and if it succeeds, a list of (priority, domain) tuples of MX records specified in the DNS for the domain.""" - mx = None + mx: list """If no MX records are actually specified in DNS and instead are inferred, through an obsolete mechanism, from A or AAAA records, the value is the type of DNS record used instead (`A` or `AAAA`).""" - mx_fallback_type = None + mx_fallback_type: str """Tests use this constructor.""" def __init__(self, **kwargs): @@ -92,13 +95,13 @@ def __eq__(self, other): self.email == other.email and self.local_part == other.local_part and self.domain == other.domain - and self.ascii_email == other.ascii_email - and self.ascii_local_part == other.ascii_local_part - and self.ascii_domain == other.ascii_domain + and getattr(self, 'ascii_email', None) == getattr(other, 'ascii_email', None) + and getattr(self, 'ascii_local_part', None) == getattr(other, 'ascii_local_part', None) + and getattr(self, 'ascii_domain', None) == getattr(other, 'ascii_domain', None) and self.smtputf8 == other.smtputf8 - and repr(sorted(self.mx) if self.mx else self.mx) - == repr(sorted(other.mx) if other.mx else other.mx) - and self.mx_fallback_type == other.mx_fallback_type + and repr(sorted(self.mx) if getattr(self, 'mx', None) else None) + == repr(sorted(other.mx) if getattr(other, 'mx', None) else None) + and getattr(self, 'mx_fallback_type', None) == getattr(other, 'mx_fallback_type', None) ) """This helps producing the README.""" diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 998ce59..1b2e6e2 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -30,7 +30,7 @@ def safe_character_display(c): return unicodedata.name(c, h) -def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=False): +def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_empty_local: bool = False): """Validates the 
syntax of the local part of an email address.""" if len(local) == 0: diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index f7efce8..8e05498 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -1,20 +1,22 @@ +from typing import Optional, Union + from .exceptions_types import EmailSyntaxError, ValidatedEmail from .syntax import validate_email_local_part, validate_email_domain_part, get_length_reason from .rfc_constants import EMAIL_MAX_LENGTH def validate_email( - email, + email: Union[str, bytes], # /, # not supported in Python 3.6, 3.7 *, - allow_smtputf8=None, - allow_empty_local=False, - check_deliverability=None, - test_environment=None, - globally_deliverable=None, - timeout=None, - dns_resolver=None -): + allow_smtputf8: Optional[bool] = None, + allow_empty_local: bool = False, + check_deliverability: Optional[bool] = None, + test_environment: Optional[bool] = None, + globally_deliverable: Optional[bool] = None, + timeout: Optional[int] = None, + dns_resolver: Optional[object] = None +) -> ValidatedEmail: """ Validates an email address, raising an EmailNotValidError if the address is not valid or returning a dict of information when the address is valid. The email argument can be a str or a bytes instance, @@ -70,7 +72,11 @@ def validate_email( # If the email address has an ASCII form, add it. 
if not ret.smtputf8: - ret.ascii_email = ret.ascii_local_part + "@" + ret.ascii_domain + if not ret.ascii_domain: + raise Exception("Missing ASCII domain.") + ret.ascii_email = (ret.ascii_local_part or "") + "@" + ret.ascii_domain + else: + ret.ascii_email = None # If the email address has an ASCII representation, then we assume it may be # transmitted in ASCII (we can't assume SMTPUTF8 will be used on all hops to diff --git a/setup.cfg b/setup.cfg index 73dcb32..6f79802 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = email_validator -version = 2.0.0-dev1 +version = 2.0.0-dev4 description = A robust email address syntax and deliverability validation library. long_description = file: README.md long_description_content_type = text/markdown @@ -29,6 +29,9 @@ install_requires = idna>=2.0.0 python_requires = >=3.7 +[options.package_data] +* = py.typed + [options.entry_points] console_scripts = email_validator=email_validator:main diff --git a/test_requirements.txt b/test_requirements.txt index e623d5c..5f11247 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -1,7 +1,7 @@ # This file was generated by running: # sudo docker run --rm -it --network=host python:3.7-slim /bin/bash # pip install dnspython idna # from setup.cfg -# pip install pytest pytest-cov coverage flake8 +# pip install pytest pytest-cov coverage flake8 mypy # pip freeze # (Some packages' latest versions may not be compatible with # the earliest Python version we support, and some exception @@ -16,6 +16,8 @@ idna==3.4 importlib-metadata==4.2.0 iniconfig==2.0.0 mccabe==0.7.0 +mypy==1.0.1 +mypy-extensions==1.0.0 packaging==23.0 pluggy==1.0.0 pycodestyle==2.9.1 @@ -23,5 +25,6 @@ pyflakes==2.5.0 pytest==7.2.1 pytest-cov==4.0.0 tomli==2.0.1 +typed-ast==1.5.4 typing_extensions==4.5.0 zipp==3.15.0 From 82413f527ac367ddf038eed92f564bf15953f343 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 14 Mar 2023 07:51:01 -0400 Subject: [PATCH 085/174] Remove universal wheel 
distribution Since universal wheels are only to support Python 2 and we no longer support Python 2, we can make the distribution a regular wheel again. Reverts #17 after valiantly supporting Python 2 for several years. Fixes #100. --- README.md | 3 --- setup.cfg | 3 --- 2 files changed, 6 deletions(-) diff --git a/README.md b/README.md index 9fc80c9..6ab5e04 100644 --- a/README.md +++ b/README.md @@ -421,6 +421,3 @@ To release: git tag v$(grep version setup.cfg | sed "s/.*= //") git push --tags ``` - -Notes: The wheel is specified as universal in the file `setup.cfg` by the `universal = 1` key in the -`[bdist_wheel]` section. diff --git a/setup.cfg b/setup.cfg index 6f79802..60ff626 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,9 +36,6 @@ python_requires = >=3.7 console_scripts = email_validator=email_validator:main -[bdist_wheel] -universal = 1 - [flake8] max-line-length = 120 From db0ebfe1d030ca8518366027c1f6084e969ab9f2 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 14 Mar 2023 08:06:33 -0400 Subject: [PATCH 086/174] Use Python 'build' package to build the distribution and update README instead of setuptools --- README.md | 9 +++++---- release_to_pypi.sh | 5 ++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 6ab5e04..e40db48 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,7 @@ Key features: Blocks unsafe characters for your safety. * Normalizes email addresses (important for internationalized addresses! see below). +* Python type annotations are used. This library does NOT permit obsolete forms of email addresses, so if you need strict validation against the email specs exactly, use @@ -411,13 +412,13 @@ To release: * Update CHANGELOG.md. * Update the version number in setup.cfg. -* Make & push a commit with the new version number. -* Make & push a tag (`git tag v... && git push --tags`). +* Make & push a commit with the new version number and make sure tests pass. 
+* Make & push a tag (see command below). * Make a release at https://github.com/JoshData/python-email-validator/releases/new. -* Follow the steps below to publish source and a universal wheel to pypi. +* Publish a source and wheel distribution to pypi (see command below). ```sh -./release_to_pypi.sh git tag v$(grep version setup.cfg | sed "s/.*= //") git push --tags +./release_to_pypi.sh ``` diff --git a/release_to_pypi.sh b/release_to_pypi.sh index d8d5e05..efef293 100755 --- a/release_to_pypi.sh +++ b/release_to_pypi.sh @@ -1,6 +1,5 @@ #!/bin/sh -pip3 install --upgrade twine +pip3 install --upgrade build twine rm -rf dist -python3 setup.py sdist -python3 setup.py bdist_wheel +python3 -m build twine upload -u __token__ dist/* # username: __token__ password: pypi API token From ca7640b85b21a3f3ef40e6bfb025876b28e356f8 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 8 Apr 2023 20:51:31 -0400 Subject: [PATCH 087/174] Fix some typos in comments, improve some comments, merge some redundant tests --- email_validator/rfc_constants.py | 2 +- email_validator/syntax.py | 4 ++-- tests/test_syntax.py | 30 +++++++++++------------------- 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/email_validator/rfc_constants.py b/email_validator/rfc_constants.py index 802f773..9584970 100644 --- a/email_validator/rfc_constants.py +++ b/email_validator/rfc_constants.py @@ -13,7 +13,7 @@ # RFC 6531 section 3.3 extends the allowed characters in internationalized # addresses to also include three specific ranges of UTF8 defined in -# RFC3629 section 4, which appear to be the Unicode code points from +# RFC 3629 section 4, which appear to be the Unicode code points from # U+0080 to U+10FFFF. ATEXT_INTL = ATEXT + u"\u0080-\U0010FFFF" ATEXT_INTL_RE = re.compile('[.' 
+ ATEXT_INTL + ']') # ATEXT_INTL plus dots diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 1b2e6e2..5449cf5 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -45,7 +45,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp "smtputf8": False, } - # Check the length of the local part by couting characters. + # Check the length of the local part by counting characters. # (RFC 5321 4.5.3.1.1) # We're checking the number of characters here. If the local part # is ASCII-only, then that's the same as bytes (octets). If it's @@ -97,7 +97,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp # so we'll return the normalized local part in the return value. local = unicodedata.normalize("NFC", local) - # Check for unsafe characters. + # Check that the local part is a valid, safe, and sensible Unicode string. # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked # by DOT_ATOM_TEXT_INTL. Other characters may be permitted by the email specs, but # they may not be valid, safe, or sensible Unicode strings. 
diff --git a/tests/test_syntax.py b/tests/test_syntax.py index fcaa7ed..ef85ead 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -256,9 +256,16 @@ def test_email_valid_intl_local_part(email_input, output): "The part after the @-sign contains invalid characters (Codepoint U+2488 not allowed " "at position 1 in '⒈wouldbeinvalid.com')."), ('@example.com', 'There must be something before the @-sign.'), + ('white space@test', 'The email address contains invalid characters before the @-sign: SPACE.'), + ('test@white space', 'The part after the @-sign contains invalid characters: SPACE.'), ('\nmy@example.com', 'The email address contains invalid characters before the @-sign: U+000A.'), ('m\ny@example.com', 'The email address contains invalid characters before the @-sign: U+000A.'), ('my\n@example.com', 'The email address contains invalid characters before the @-sign: U+000A.'), +<<<<<<< HEAD +======= + ('test@\n', 'The part after the @-sign contains invalid characters: U+000A.'), + ('bad"quotes"@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), +>>>>>>> 6146614 (fixup) ('11111111112222222222333333333344444444445555555555666666666677777@example.com', 'The email address is too long before the @-sign (1 character too many).'), ('111111111122222222223333333333444444444455555555556666666666777777@example.com', 'The email address is too long before the @-sign (2 characters too many).'), ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444444444455555555556.com', 'The email address is too long (4 characters too many).'), @@ -318,7 +325,8 @@ def test_email_invalid_reserved_domain(email_input): ], ) def test_email_unsafe_character(s, expected_error): - # Check for various unsafe characters: + # Check for various unsafe characters that are 
permitted by the email + # specs but should be disallowed for being unsafe or not sensible Unicode. with pytest.raises(EmailSyntaxError) as exc_info: validate_email(s + "@test", test_environment=True) @@ -329,22 +337,6 @@ def test_email_unsafe_character(s, expected_error): assert "The email address contains unsafe characters" in str(exc_info.value) -@pytest.mark.parametrize( - ('email_input', 'expected_error'), - [ - ('white space@test', 'The email address contains invalid characters before the @-sign: SPACE.'), - ('test@white space', 'The part after the @-sign contains invalid characters: SPACE.'), - ('\n@test', 'The email address contains invalid characters before the @-sign: U+000A.'), - ('test@\n', 'The part after the @-sign contains invalid characters: U+000A.'), - ], -) -def test_email_invalid_character(email_input, expected_error): - # Check for various unsafe test_email_invalid_character_smtputf8: - with pytest.raises(EmailSyntaxError) as exc_info: - validate_email(email_input, test_environment=True) - assert str(exc_info.value) == expected_error - - @pytest.mark.parametrize( ('email_input', 'expected_error'), [ @@ -352,7 +344,7 @@ def test_email_invalid_character(email_input, expected_error): ], ) def test_email_invalid_character_smtputf8(email_input, expected_error): - # Check for various unsafe characters: + # Check that internationalized characters are rejected if allow_smtputf8=False. with pytest.raises(EmailSyntaxError) as exc_info: validate_email(email_input, allow_smtputf8=False, test_environment=True) assert str(exc_info.value) == expected_error @@ -545,7 +537,7 @@ def test_pyisemail_tests(email_input, status): elif "_ERR_" in status or "_TOOLONG" in status \ or "_CFWS_FWS" in status or "_CFWS_COMMENT" in status \ or "_IPV6" in status or status == "ISEMAIL_RFC5322_DOMAIN": - # Invalid syntax, extranous whitespace, and "(comments)" should be rejected. + # Invalid syntax, extraneous whitespace, and "(comments)" should be rejected. 
# The _IPV6_ diagnoses appear to represent syntactically invalid domain literals. # The ISEMAIL_RFC5322_DOMAIN diagnosis appears to be a syntactically invalid domain. with pytest.raises(EmailSyntaxError): From 8fdbaba8e67435ff0820f0cfe0e2876d53f1b18b Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Fri, 7 Apr 2023 10:58:21 -0400 Subject: [PATCH 088/174] Add some tests for invalid addresses with quotes in preparation for parsing them --- email_validator/syntax.py | 2 ++ tests/test_syntax.py | 23 +++++++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 5449cf5..1bd7f3c 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -17,6 +17,8 @@ def get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH): def safe_character_display(c): # Return safely displayable characters in quotes. + if c == '\\': + return f"\"{c}\"" # can't use repr because it escapes it if unicodedata.category(c)[0] in ("L", "N", "P", "S"): return repr(c) diff --git a/tests/test_syntax.py b/tests/test_syntax.py index ef85ead..acef671 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -223,6 +223,24 @@ def test_email_valid_intl_local_part(email_input, output): assert "Internationalized characters before the @-sign are not supported: " in str(exc_info.value) +@pytest.mark.parametrize( + 'email_input,error_msg', + [ + ('"unnecessarily.quoted.local.part"@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), + ('"quoted..local.part"@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), + ('"quoted.with.at@"@example.com', 'The email address is not valid. 
It must have exactly one @-sign.'), + ('"quoted with space"@example.com', 'The email address contains invalid characters before the @-sign: \'\"\', SPACE.'), + ('"quoted.with.dquote\\""@example.com', 'The email address contains invalid characters before the @-sign: "\\", \'"\'.'), + ('"unnecessarily.quoted.with.unicode.λ"@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), + ('"quoted.with..unicode.λ"@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), + ('"quoted.with.extraneous.\\escape"@example.com', 'The email address contains invalid characters before the @-sign: "\\", \'"\'.'), + ]) +def test_email_valid_only_if_quoted_local_part(email_input, error_msg): + with pytest.raises(EmailSyntaxError) as exc_info: + validate_email(email_input) + assert str(exc_info.value) == error_msg + + @pytest.mark.parametrize( 'email_input,error_msg', [ @@ -261,11 +279,8 @@ def test_email_valid_intl_local_part(email_input, output): ('\nmy@example.com', 'The email address contains invalid characters before the @-sign: U+000A.'), ('m\ny@example.com', 'The email address contains invalid characters before the @-sign: U+000A.'), ('my\n@example.com', 'The email address contains invalid characters before the @-sign: U+000A.'), -<<<<<<< HEAD -======= ('test@\n', 'The part after the @-sign contains invalid characters: U+000A.'), ('bad"quotes"@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), ->>>>>>> 6146614 (fixup) ('11111111112222222222333333333344444444445555555555666666666677777@example.com', 'The email address is too long before the @-sign (1 character too many).'), ('111111111122222222223333333333444444444455555555556666666666777777@example.com', 'The email address is too long before the @-sign (2 characters too many).'), 
('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444444444455555555556.com', 'The email address is too long (4 characters too many).'), @@ -343,7 +358,7 @@ def test_email_unsafe_character(s, expected_error): ('λambdaツ@test', 'Internationalized characters before the @-sign are not supported: \'λ\', \'ツ\'.'), ], ) -def test_email_invalid_character_smtputf8(email_input, expected_error): +def test_email_invalid_character_smtputf8_off(email_input, expected_error): # Check that internationalized characters are rejected if allow_smtputf8=False. with pytest.raises(EmailSyntaxError) as exc_info: validate_email(email_input, allow_smtputf8=False, test_environment=True) From 1b9e867758d666885d097e5b77c754aecca70703 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Fri, 7 Apr 2023 23:00:35 -0400 Subject: [PATCH 089/174] Parse quoted-string local parts but by default keep them disallowed with better exception messages People have opened issues several times about quoted local parts being incorrectly rejected. We can give a better error when it happens to head-off questions about it by parsing them so that we know when they occur. * Detect when a quoted-string local part might be present when splitting the address into a local part and domain part when the address has quoted @-signs in the local part rather than giving an error message about multiple @-signs. * Remove the surrounding quotes and un-escape the string before checking the syntax of the local part. Return the un-quoted and un-escaped string as the normalized local_part in the returned ValidatedEmail object if it's valid as an unquoted local part. * Check for invalid characters in the quoted-string (per the spec and our additional Unicode character checks) and raise exceptions. 
* Add a new option to accept quoted-string local parts which is off by default. When accepting them, apply Unicode normalization as per dot-atom internationalized addresses and apply minimal backslash escaping. * Update tests. See #54, #92. --- CHANGELOG.md | 1 + README.md | 32 +++++---- email_validator/__init__.py | 3 +- email_validator/rfc_constants.py | 9 +++ email_validator/syntax.py | 115 ++++++++++++++++++++++++------ email_validator/validate_email.py | 45 +++++++++--- tests/test_syntax.py | 54 ++++++++++---- 7 files changed, 201 insertions(+), 58 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bcd013e..ff57248 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ There are no significant changes to which email addresses are considered valid/i * The dnspython package is no longer required if DNS checks are not used, although it will install automatically. * NoNameservers and NXDOMAIN DNS errors are now handled differently: NoNameservers no longer fails validation, and NXDOMAIN now skips checking for an A/AAAA fallback and goes straight to failing validation. * Some syntax error messages have changed because they are now checked explicitly rather than as a part of other checks. +* The quoted-string local part syntax (e.g. multiple @-signs, spaces, etc. if surrounded by quotes) is now parsed but not considered valid by default. Better error messages are now given for quoted-string syntax since it can be confusing for a technically valid address to be rejected, and a new allow_quoted_local option is added to allow these addresses if you really need them. * Some other error messages have changed to not repeat the email address in the error message. * The library has been reorganized internally into smaller modules. * The tests have been reorganized and expanded. Deliverability tests now mostly use captured DNS responses so they can be run off-line. 
diff --git a/README.md b/README.md index e40db48..fd88934 100644 --- a/README.md +++ b/README.md @@ -19,17 +19,18 @@ Key features: can display to end-users. * Checks deliverability (optional): Does the domain name resolve? (You can override the default DNS resolver to add query caching.) -* Supports internationalized domain names and internationalized local parts. +* Supports internationalized domain names and internationalized local parts, + and with an option deprecated quoted-string local parts. Blocks unsafe characters for your safety. * Normalizes email addresses (important for internationalized - addresses! see below). + and quoted-string addresses! see below). * Python type annotations are used. -This library does NOT permit obsolete forms of email addresses, so -if you need strict validation against the email specs exactly, use +This library does NOT permit obsolete forms of email addresses by default, +so if you need strict validation against the email specs exactly, use [pyIsEmail](https://github.com/michaelherold/pyIsEmail) or try [flanker](https://github.com/mailgun/flanker) if you are parsing the -To: line of an email. +"To:" line of an email. [![Build Status](https://github.com/JoshData/python-email-validator/actions/workflows/test_and_build.yaml/badge.svg)](https://github.com/JoshData/python-email-validator/actions/workflows/test_and_build.yaml) @@ -103,8 +104,8 @@ But when an email address is valid, an object is returned containing a normalized form of the email address (which you should use!) and other information. -The validator doesn't permit obsoleted forms of email addresses that no -one uses anymore even though they are still valid and deliverable, since +The validator doesn't, by default, permit obsoleted forms of email addresses +that no one uses anymore even though they are still valid and deliverable, since they will probably give you grief if you're using email for login. (See later in the document about that.) 
@@ -134,6 +135,8 @@ The `validate_email` function also accepts the following keyword arguments require the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) extension. You can also set `email_validator.ALLOW_SMTPUTF8` to `False` to turn it off for all calls by default. +`allow_quoted_local=False`: Set to `True` to allow obscure and potentially problematic email addresses in which the part of the address before the @-sign contains spaces, @-signs, or other surprising characters when the local part is surrounded in quotes (so-called quoted-string local parts). In the object returned by `validate_email`, the normalized local part removes any unnecessary backslash-escaping and even removes the surrounding quotes if the address would be valid without them. You can also set `email_validator.ALLOW_QUOTED_LOCAL` to `True` to turn this on for all calls by default. + `allow_empty_local=False`: Set to `True` to allow an empty local part (i.e. `@example.com`), e.g. for validating Postfix aliases. @@ -288,6 +291,11 @@ and conversion from Punycode to Unicode characters. 3.1](https://tools.ietf.org/html/rfc6532#section-3.1) and [RFC 5895 (IDNA 2008) section 2](http://www.ietf.org/rfc/rfc5895.txt).) +Normalization is also applied to quoted-string local parts if you have +allowed them by the `allow_quoted_local` option. Unnecessary backslash +escaping is removed and even the surrounding quotes are removed if they +are unnecessary. + Examples -------- @@ -355,9 +363,9 @@ are: | Field | Value | | -----:|-------| -| `email` | The normalized form of the email address that you should put in your database. This merely combines the `local_part` and `domain` fields (see below). | +| `email` | The normalized form of the email address that you should put in your database. This combines the `local_part` and `domain` fields (see below). 
| | `ascii_email` | If set, an ASCII-only form of the email address by replacing the domain part with [IDNA](https://tools.ietf.org/html/rfc5891) [Punycode](https://www.rfc-editor.org/rfc/rfc3492.txt). This field will be present when an ASCII-only form of the email address exists (including if the email address is already ASCII). If the local part of the email address contains internationalized characters, `ascii_email` will be `None`. If set, it merely combines `ascii_local_part` and `ascii_domain`. | -| `local_part` | The local part of the given email address (before the @-sign) with Unicode NFC normalization applied. | +| `local_part` | The normalized local part of the given email address (before the @-sign). Normalization includes Unicode NFC normalization and removing unnecessary quoted-string quotes and backslashes. If `allow_quoted_local` is True and the surrounding quotes are necessary, the quotes _will_ be present in this field. | | `ascii_local_part` | If set, the local part, which is composed of ASCII characters only. | | `domain` | The canonical internationalized Unicode form of the domain part of the email address. If the returned string contains non-ASCII characters, either the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) feature of your mail relay will be required to transmit the message or else the email address's domain part must be converted to IDNA ASCII first: Use `ascii_domain` field instead. | | `ascii_domain` | The [IDNA](https://tools.ietf.org/html/rfc5891) [Punycode](https://www.rfc-editor.org/rfc/rfc3492.txt)-encoded form of the domain part of the given email address, as it would be transmitted on the wire. | @@ -383,9 +391,9 @@ or likely to cause trouble: (except see the `test_environment` parameter above). * Obsolete email syntaxes are rejected: The "quoted string" form of the local part of the email address (RFC - 5321 4.1.2) is not permitted. 
- Quoted forms allow multiple @-signs, space characters, and other - troublesome conditions. The unusual [(comment) syntax](https://github.com/JoshData/python-email-validator/issues/77) + 5321 4.1.2) is not permitted unless `allow_quoted_local=True` is given + (see above). + The unusual ["(comment)" syntax](https://github.com/JoshData/python-email-validator/issues/77) is also rejected. The "literal" form for the domain part of an email address (an IP address in brackets) is rejected. Other obsolete and deprecated syntaxes are rejected. No one uses these forms anymore. diff --git a/email_validator/__init__.py b/email_validator/__init__.py index 9d5373e..aa0dc7c 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -25,9 +25,10 @@ def caching_resolver(*args, **kwargs): # Default values for keyword arguments. ALLOW_SMTPUTF8 = True +ALLOW_QUOTED_LOCAL = False +GLOBALLY_DELIVERABLE = True CHECK_DELIVERABILITY = True TEST_ENVIRONMENT = False -GLOBALLY_DELIVERABLE = True DEFAULT_TIMEOUT = 15 # secs # IANA Special Use Domain Names diff --git a/email_validator/rfc_constants.py b/email_validator/rfc_constants.py index 9584970..cfbde12 100644 --- a/email_validator/rfc_constants.py +++ b/email_validator/rfc_constants.py @@ -27,6 +27,15 @@ DOT_ATOM_TEXT_HOSTNAME = re.compile(HOSTNAME_LABEL + r'(?:\.' + HOSTNAME_LABEL + r')*\Z') DOMAIN_NAME_REGEX = re.compile(r"[A-Za-z]\Z") # all TLDs currently end with a letter +# Quoted-string local part (RFC 5321 4.1.2, internationalized by RFC 6531 section 3.3) +# The permitted characters in a quoted string are the characters in the range +# 32-126, except that quotes and (literal) backslashes can only appear when escaped +# by a backslash. When internationalized, UTF8 strings are also permitted except +# the ASCII characters that are not previously permitted (see above). 
+# QUOTED_LOCAL_PART_ADDR = re.compile(r"^\"((?:[\u0020-\u0021\u0023-\u005B\u005D-\u007E]|\\[\u0020-\u007E])*)\"@(.*)") +QUOTED_LOCAL_PART_ADDR = re.compile(r"^\"((?:[^\"\\]|\\.)*)\"@(.*)") +QTEXT_INTL = re.compile(r"[\u0020-\u007E\u0080-\U0010FFFF]") + # Length constants # RFC 3696 + errata 1003 + errata 1690 (https://www.rfc-editor.org/errata_search.php?rfc=3696&eid=1690) # explains the maximum length of an email address is 254 octets. diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 1bd7f3c..b10e5de 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -1,10 +1,12 @@ from .exceptions_types import EmailSyntaxError from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \ - DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_RE, ATEXT_HOSTNAME_INTL, DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX + DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \ + DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX import re import unicodedata import idna # implements IDNA 2008; Python's codec is only IDNA 2003 +from typing import Optional def get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH): @@ -32,7 +34,8 @@ def safe_character_display(c): return unicodedata.name(c, h) -def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_empty_local: bool = False): +def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_empty_local: bool = False, + quoted_local_part: bool = False): """Validates the syntax of the local part of an email address.""" if len(local) == 0: @@ -61,24 +64,32 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp # Check the local part against the non-internationalized regular expression. # Most email addresses match this regex so it's probably fastest to check this first. 
# (RFC 2822 3.2.4) + # All local parts matching the dot-atom rule are also valid as a quoted string + # so if it was originally quoted (quoted_local_part is True) and this regex matches, + # it's ok. + # (RFC 5321 4.1.2). m = DOT_ATOM_TEXT.match(local) if m: - # It's valid. + # It's valid. And since it's just the permitted ASCII characters, + # it's normalized and safe. If the local part was originally quoted, + # the quoting was unnecessary and it'll be returned as normalized to + # non-quoted form. - # Return the local part unchanged and flag that SMTPUTF8 is not needed. + # Return the local part and flag that SMTPUTF8 is not needed. return { "local_part": local, "ascii_local_part": local, "smtputf8": False, } - # The local part failed the ASCII check. Try the extended character set + # The local part failed the basic dot-atom check. Try the extended character set # for internationalized addresses. It's the same pattern but with additional # characters permitted. + # RFC 6531 section 3.3. + valid: Optional[str] = None + requires_smtputf8 = False m = DOT_ATOM_TEXT_INTL.match(local) if m: - # It's valid. - # But international characters in the local part may not be permitted. if not allow_smtputf8: # Check for invalid characters against the non-internationalized @@ -95,15 +106,56 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp # Although the check above should always find something, fall back to this just in case. raise EmailSyntaxError("Internationalized characters before the @-sign are not supported.") - # RFC 6532 section 3.1 also says that Unicode NFC normalization should be applied, + # It's valid. + valid = "dot-atom" + requires_smtputf8 = True + + # There are no syntactic restrictions on quoted local parts, so if + # it was originally quoted, it is probably valid. More characters + # are allowed, like @-signs, spaces, and quotes, and there are no + # restrictions on the placement of dots, as in dot-atom local parts. 
+ elif quoted_local_part: + # Check for invalid characters in a quoted string local part. + # (RFC 5321 4.1.2. RFC 5322 lists additional permitted *obsolete* + # characters which are *not* allowed here. RFC 6531 section 3.3 + # extends the range to UTF8 strings.) + bad_chars = set( + safe_character_display(c) + for c in local + if not QTEXT_INTL.match(c) + ) + if bad_chars: + raise EmailSyntaxError("The email address contains invalid characters in quotes before the @-sign: " + ", ".join(sorted(bad_chars)) + ".") + + # See if any characters are outside of the ASCII range. + bad_chars = set( + safe_character_display(c) + for c in local + if not (32 <= ord(c) <= 126) + ) + if bad_chars: + requires_smtputf8 = True + + # International characters in the local part may not be permitted. + if not allow_smtputf8: + raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(sorted(bad_chars)) + ".") + + # It's valid. + valid = "quoted" + + # If the local part matches the internationalized dot-atom form or was quoted, + # perform normalization and additional checks for Unicode strings. + if valid: + # RFC 6532 section 3.1 says that Unicode NFC normalization should be applied, # so we'll return the normalized local part in the return value. local = unicodedata.normalize("NFC", local) # Check that the local part is a valid, safe, and sensible Unicode string. # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked - # by DOT_ATOM_TEXT_INTL. Other characters may be permitted by the email specs, but - # they may not be valid, safe, or sensible Unicode strings. - check_unsafe_chars(local) + # by DOT_ATOM_TEXT_INTL and QTEXT_INTL. Other characters may be permitted by the + # email specs, but they may not be valid, safe, or sensible Unicode strings. + # See the function for rationale. + check_unsafe_chars(local, allow_space=(valid == "quoted")) # Try encoding to UTF-8. 
Failure is possible with some characters like # surrogate code points, but those are checked above. Still, we don't @@ -113,15 +165,22 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp except ValueError: raise EmailSyntaxError("The email address contains an invalid character.") - # Flag that SMTPUTF8 will be required for deliverability. + # If this address passes only by the quoted string form, re-quote it + # and backslash-escape quotes and backslashes (removing any unnecessary + # escapes). Per RFC 5321 4.1.2, "all quoted forms MUST be treated as equivalent, + # and the sending system SHOULD transmit the form that uses the minimum quoting possible." + if valid == "quoted": + local = '"' + re.sub(r'(["\\])', r'\\\1', local) + '"' + return { "local_part": local, - "ascii_local_part": None, # no ASCII form is possible - "smtputf8": True, + "ascii_local_part": local if not requires_smtputf8 else None, + "smtputf8": requires_smtputf8, } - # It's not a valid local part either non-internationalized or internationalized. - # Let's find out why. + # It's not a valid local part. Let's find out why. + # (Since quoted local parts are all valid or handled above, these checks + # don't apply in those cases.) # Check for invalid characters. # (RFC 2822 Section 3.2.4 / RFC 5322 Section 3.2.3, plus RFC 6531 section 3.3) @@ -142,7 +201,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp raise EmailSyntaxError("The email address contains invalid characters before the @-sign.") -def check_unsafe_chars(s): +def check_unsafe_chars(s, allow_space=False): # Check for unsafe characters or characters that would make the string # invalid or non-sensible Unicode. bad_chars = set() @@ -158,13 +217,25 @@ def check_unsafe_chars(s): # sensible. 
if i == 0: bad_chars.add(c) + elif category == "Zs": + # Spaces outside of the ASCII range are not specifically disallowed in + # internationalized addresses as far as I can tell, but they violate + # the spirit of the non-internationalized specification that email + # addresses do not contain ASCII spaces when not quoted. Excluding + # ASCII spaces when not quoted is handled directly by the atom regex. + # + # In quoted-string local parts, spaces are explicitly permitted, and + # the ASCII space has category Zs, so we must allow it here, and we'll + # allow all Unicode spaces to be consistent. + if not allow_space: + bad_chars.add(c) elif category[0] == "Z": - # Spaces and line/paragraph characters (Z) outside of the ASCII range - # are not specifically disallowed as far as I can tell, but they - # violate the spirit of the non-internationalized specification that - # email addresses do not contain spaces or line breaks when not quoted. + # The two line and paragraph separator characters (in categories Zl and Zp) + # are not specifically disallowed in internationalized addresses + # as far as I can tell, but they violate the spirit of the non-internationalized + # specification that email addresses do not contain line breaks when not quoted. bad_chars.add(c) - elif category[0] == "C": + elif category[0] in ("C", "Z"): # Control, format, surrogate, private use, and unassigned code points (C) # are all unsafe in various ways. Control and format characters can affect # text rendering if the email address is concatenated with other text. 
diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index 8e05498..28f1151 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -2,7 +2,7 @@ from .exceptions_types import EmailSyntaxError, ValidatedEmail from .syntax import validate_email_local_part, validate_email_domain_part, get_length_reason -from .rfc_constants import EMAIL_MAX_LENGTH +from .rfc_constants import EMAIL_MAX_LENGTH, QUOTED_LOCAL_PART_ADDR def validate_email( @@ -11,6 +11,7 @@ def validate_email( *, allow_smtputf8: Optional[bool] = None, allow_empty_local: bool = False, + allow_quoted_local: Optional[bool] = None, check_deliverability: Optional[bool] = None, test_environment: Optional[bool] = None, globally_deliverable: Optional[bool] = None, @@ -24,9 +25,12 @@ def validate_email( """ # Fill in default values of arguments. - from . import ALLOW_SMTPUTF8, CHECK_DELIVERABILITY, TEST_ENVIRONMENT, GLOBALLY_DELIVERABLE, DEFAULT_TIMEOUT + from . import ALLOW_SMTPUTF8, ALLOW_QUOTED_LOCAL, GLOBALLY_DELIVERABLE, \ + CHECK_DELIVERABILITY, TEST_ENVIRONMENT, DEFAULT_TIMEOUT if allow_smtputf8 is None: allow_smtputf8 = ALLOW_SMTPUTF8 + if allow_quoted_local is None: + allow_quoted_local = ALLOW_QUOTED_LOCAL if check_deliverability is None: check_deliverability = CHECK_DELIVERABILITY if test_environment is None: @@ -45,25 +49,48 @@ def validate_email( except ValueError: raise EmailSyntaxError("The email address is not valid ASCII.") - # At-sign. - parts = email.split('@') - if len(parts) != 2: - raise EmailSyntaxError("The email address is not valid. It must have exactly one @-sign.") + # Typical email addresses have a single @-sign, but the + # awkward "quoted string" local part form (RFC 5321 4.1.2) + # allows @-signs (and escaped quotes) to appear in the local + # part if the local part is quoted. If the address is quoted, + # split it at a non-escaped @-sign and unescape the escaping. 
+ quoted_local_part = False + m = QUOTED_LOCAL_PART_ADDR.match(email) + if m: + quoted_local_part = True + local_part, domain_part = m.groups() + + # Remove backslashes. + import re + local_part = re.sub(r"\\(.)", "\\1", local_part) + + else: + # Split at the one and only at-sign. + parts = email.split('@') + if len(parts) != 2: + raise EmailSyntaxError("The email address is not valid. It must have exactly one @-sign.") + local_part, domain_part = parts # Collect return values in this instance. ret = ValidatedEmail() ret.original_email = email # Validate the email address's local part syntax and get a normalized form. - local_part_info = validate_email_local_part(parts[0], + # If the original address was quoted and the decoded local part is a valid + # unquoted local part, then we'll get back a normalized (unescaped) local + # part. + local_part_info = validate_email_local_part(local_part, allow_smtputf8=allow_smtputf8, - allow_empty_local=allow_empty_local) + allow_empty_local=allow_empty_local, + quoted_local_part=quoted_local_part) + if quoted_local_part and not allow_quoted_local: + raise EmailSyntaxError("Quoting the part before the @-sign is not allowed here.") ret.local_part = local_part_info["local_part"] ret.ascii_local_part = local_part_info["ascii_local_part"] ret.smtputf8 = local_part_info["smtputf8"] # Validate the email address's domain part syntax and get a normalized form. 
- domain_part_info = validate_email_domain_part(parts[1], test_environment=test_environment, globally_deliverable=globally_deliverable) + domain_part_info = validate_email_domain_part(domain_part, test_environment=test_environment, globally_deliverable=globally_deliverable) ret.domain = domain_part_info["domain"] ret.ascii_domain = domain_part_info["ascii_domain"] diff --git a/tests/test_syntax.py b/tests/test_syntax.py index acef671..1c0954e 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -224,21 +224,29 @@ def test_email_valid_intl_local_part(email_input, output): @pytest.mark.parametrize( - 'email_input,error_msg', + 'email_input,normalized_local_part', [ - ('"unnecessarily.quoted.local.part"@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), - ('"quoted..local.part"@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), - ('"quoted.with.at@"@example.com', 'The email address is not valid. It must have exactly one @-sign.'), - ('"quoted with space"@example.com', 'The email address contains invalid characters before the @-sign: \'\"\', SPACE.'), - ('"quoted.with.dquote\\""@example.com', 'The email address contains invalid characters before the @-sign: "\\", \'"\'.'), - ('"unnecessarily.quoted.with.unicode.λ"@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), - ('"quoted.with..unicode.λ"@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), - ('"quoted.with.extraneous.\\escape"@example.com', 'The email address contains invalid characters before the @-sign: "\\", \'"\'.'), + ('"unnecessarily.quoted.local.part"@example.com', 'unnecessarily.quoted.local.part'), + ('"quoted..local.part"@example.com', '"quoted..local.part"'), + ('"quoted.with.at@"@example.com', '"quoted.with.at@"'), + ('"quoted with space"@example.com', '"quoted with space"'), + ('"quoted.with.dquote\\""@example.com', 
'"quoted.with.dquote\\""'), + ('"unnecessarily.quoted.with.unicode.λ"@example.com', 'unnecessarily.quoted.with.unicode.λ'), + ('"quoted.with..unicode.λ"@example.com', '"quoted.with..unicode.λ"'), + ('"quoted.with.extraneous.\\escape"@example.com', 'quoted.with.extraneous.escape'), ]) -def test_email_valid_only_if_quoted_local_part(email_input, error_msg): +def test_email_valid_only_if_quoted_local_part(email_input, normalized_local_part): + # These addresses are invalid with the default allow_quoted_local=False option. with pytest.raises(EmailSyntaxError) as exc_info: validate_email(email_input) - assert str(exc_info.value) == error_msg + assert str(exc_info.value) == 'Quoting the part before the @-sign is not allowed here.' + + # But they are valid if quoting is allowed. + validated = validate_email(email_input, allow_quoted_local=True, check_deliverability=False) + + # Check that the normalized form correctly removed unnecessary backslash escaping + # and even the quoting if they weren't necessary. 
+ assert validated.local_part == normalized_local_part @pytest.mark.parametrize( @@ -329,6 +337,8 @@ def test_email_invalid_reserved_domain(email_input): ('s', 'expected_error'), [ ('\u2005', 'FOUR-PER-EM SPACE'), # four-per-em space (Zs) + ('\u2028', 'LINE SEPARATOR'), # line separator (Zl) + ('\u2029', 'PARAGRAPH SEPARATOR'), # paragraph separator (Zp) ('\u0300', 'COMBINING GRAVE ACCENT'), # grave accent (M) ('\u009C', 'U+009C'), # string terminator (Cc) ('\u200B', 'ZERO WIDTH SPACE'), # zero-width space (Cf) @@ -356,6 +366,7 @@ def test_email_unsafe_character(s, expected_error): ('email_input', 'expected_error'), [ ('λambdaツ@test', 'Internationalized characters before the @-sign are not supported: \'λ\', \'ツ\'.'), + ('"quoted.with..unicode.λ"@example.com', 'Internationalized characters before the @-sign are not supported: \'λ\'.'), ], ) def test_email_invalid_character_smtputf8_off(email_input, expected_error): @@ -365,6 +376,13 @@ def test_email_invalid_character_smtputf8_off(email_input, expected_error): assert str(exc_info.value) == expected_error +def test_email_empty_local(): + validate_email("@test", allow_empty_local=True, test_environment=True) + + # This next one might not be desirable. 
+ validate_email("\"\"@test", allow_empty_local=True, allow_quoted_local=True, test_environment=True) + + def test_email_test_domain_name_in_test_environment(): validate_email("anything@test", test_environment=True) validate_email("anything@mycompany.test", test_environment=True) @@ -424,7 +442,7 @@ def test_email_test_domain_name_in_test_environment(): ['a@abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefg.hij', 'ISEMAIL_RFC5322_TOOLONG'], ['a@abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefg.hijk', 'ISEMAIL_RFC5322_DOMAIN_TOOLONG'], ['"test"@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], - ['""@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], + # ['""@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], # we think an empty quoted string should be invalid ['"""@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], ['"\\a"@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], ['"\\""@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], @@ -549,6 +567,13 @@ def test_pyisemail_tests(email_input, status): if status == "ISEMAIL_VALID": # All standard email address forms should not raise an exception. validate_email(email_input, test_environment=True) + + elif status == "ISEMAIL_RFC5321_QUOTEDSTRING": + # Only valid with an option. 
+ with pytest.raises(EmailSyntaxError): + validate_email(email_input, test_environment=True) + validate_email(email_input, allow_quoted_local=True, test_environment=True) + elif "_ERR_" in status or "_TOOLONG" in status \ or "_CFWS_FWS" in status or "_CFWS_COMMENT" in status \ or "_IPV6" in status or status == "ISEMAIL_RFC5322_DOMAIN": @@ -557,13 +582,14 @@ def test_pyisemail_tests(email_input, status): # The ISEMAIL_RFC5322_DOMAIN diagnosis appears to be a syntactically invalid domain. with pytest.raises(EmailSyntaxError): validate_email(email_input, test_environment=True) + elif "_DEPREC_" in status \ - or "RFC5321_QUOTEDSTRING" in status \ or "DOMAINLITERAL" in status or "_DOMLIT_" in status or "_ADDRESSLITERAL" in status: - # Quoted strings in the local part, domain literals (IP addresses in brackets), + # Domain literals (IP addresses in brackets) # and other deprecated syntax are valid email addresses and are accepted by pyIsEmail, # but we reject them. with pytest.raises(EmailSyntaxError): validate_email(email_input, test_environment=True) + else: raise ValueError(f"status {status} is not recognized") From 18106ca560a8acbfa89a1e8d1330ca3a07108c2d Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 9 Apr 2023 10:54:47 -0400 Subject: [PATCH 090/174] In the __main__ tool read options to validate_email from environment variables --- CHANGELOG.md | 1 + email_validator/__main__.py | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ff57248..c52ed57 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ There are no significant changes to which email addresses are considered valid/i * Some other error messages have changed to not repeat the email address in the error message. * The library has been reorganized internally into smaller modules. * The tests have been reorganized and expanded. Deliverability tests now mostly use captured DNS responses so they can be run off-line. 
+* The __main__ tool now reads options to validate_email from environment variables. * Type annotations have been added to the exported methods and the ValidatedEmail class and some internal methods. Version 1.3.1 (January 21, 2023) diff --git a/email_validator/__main__.py b/email_validator/__main__.py index 9330553..52d1054 100644 --- a/email_validator/__main__.py +++ b/email_validator/__main__.py @@ -10,8 +10,12 @@ # invalid email addresses. When passing an email address on the command # line, if the email address is valid, information about it will be printed. # When using STDIN, no output will be given for valid email addresses. +# +# Keyword arguments to validate_email can be set in environment variables +# of the same name but uppercase (see below). import json +import os import sys from .validate_email import validate_email @@ -22,20 +26,30 @@ def main(dns_resolver=None): # The dns_resolver argument is for tests. + # Set options from environment variables. + options = {} + for varname in ('ALLOW_SMTPUTF8', 'ALLOW_QUOTED_LOCAL', 'GLOBALLY_DELIVERABLE', + 'CHECK_DELIVERABILITY', 'TEST_ENVIRONMENT'): + if varname in os.environ: + options[varname.lower()] = bool(os.environ[varname]) + for varname in ('DEFAULT_TIMEOUT',): + if varname in os.environ: + options[varname.lower()] = float(os.environ[varname]) + if len(sys.argv) == 1: # Validate the email addresses pased line-by-line on STDIN. dns_resolver = dns_resolver or caching_resolver() for line in sys.stdin: email = line.strip() try: - validate_email(email, dns_resolver=dns_resolver) + validate_email(email, dns_resolver=dns_resolver, **options) except EmailNotValidError as e: print(f"{email} {e}") else: # Validate the email address passed on the command line. 
email = sys.argv[1] try: - result = validate_email(email, dns_resolver=dns_resolver) + result = validate_email(email, dns_resolver=dns_resolver, **options) print(json.dumps(result.as_dict(), indent=2, sort_keys=True, ensure_ascii=False)) except EmailNotValidError as e: print(e) From 416aeb65c1cb354738b933991d456b52244f93ec Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 11 Apr 2023 16:04:58 -0400 Subject: [PATCH 091/174] Parse domain-literal addresses, give nice error messages, and add an option to permit them, but reject them by default --- CHANGELOG.md | 4 +- README.md | 31 +++++++------ email_validator/__init__.py | 1 + email_validator/__main__.py | 4 +- email_validator/exceptions_types.py | 8 +++- email_validator/rfc_constants.py | 3 ++ email_validator/syntax.py | 69 ++++++++++++++++++++++++++--- email_validator/validate_email.py | 34 +++++++++++--- tests/test_syntax.py | 61 +++++++++++++++++++++---- 9 files changed, 175 insertions(+), 40 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c52ed57..f857b28 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,13 +3,13 @@ This is a pre-release for version 2.0.0. -There are no significant changes to which email addresses are considered valid/invalid, but there are many changes in error messages and internal improvements to the library including the addition of type annotations, and Python 3.7+ is now required. +There are no significant changes to which email addresses are considered valid/invalid with default options, but there are many changes in error messages and internal improvements to the library including the addition of type annotations. New options are added to allow quoted-string local parts and domain-literal addresses, but they are off by default. And Python 3.7+ is now required. * Python 2.x and 3.x versions through 3.6, and dnspython 1.x, are no longer supported. Python 3.7+ with dnspython 2.x are now required. 
* The dnspython package is no longer required if DNS checks are not used, although it will install automatically. * NoNameservers and NXDOMAIN DNS errors are now handled differently: NoNameservers no longer fails validation, and NXDOMAIN now skips checking for an A/AAAA fallback and goes straight to failing validation. * Some syntax error messages have changed because they are now checked explicitly rather than as a part of other checks. -* The quoted-string local part syntax (e.g. multiple @-signs, spaces, etc. if surrounded by quotes) is now parsed but not considered valid by default. Better error messages are now given for quoted-string syntax since it can be confusing for a technically valid address to be rejected, and a new allow_quoted_local option is added to allow these addresses if you really need them. +* The quoted-string local part syntax (e.g. multiple @-signs, spaces, etc. if surrounded by quotes) and domain-literal addresses (e.g. @[192.XXX...] or @[IPv6:...]) are now parsed but not considered valid by default. Better error messages are now given for these addresses since it can be confusing for a technically valid address to be rejected, and new allow_quoted_local and allow_domain_literal options are added to allow these addresses if you really need them. * Some other error messages have changed to not repeat the email address in the error message. * The library has been reorganized internally into smaller modules. * The tests have been reorganized and expanded. Deliverability tests now mostly use captured DNS responses so they can be run off-line. diff --git a/README.md b/README.md index fd88934..563e36b 100644 --- a/README.md +++ b/README.md @@ -14,13 +14,12 @@ Key features: * Checks that an email address has the correct syntax --- good for registration/login forms or other uses related to identifying users. - Rejects obsolete email address syntax that you'd find unexpected. 
+ By default, rejects obsolete email address syntax that you'd find unexpected. * Gives friendly English error messages when validation fails that you can display to end-users. * Checks deliverability (optional): Does the domain name resolve? (You can override the default DNS resolver to add query caching.) -* Supports internationalized domain names and internationalized local parts, - and with an option deprecated quoted-string local parts. +* Supports internationalized domain names and internationalized local parts. Blocks unsafe characters for your safety. * Normalizes email addresses (important for internationalized and quoted-string addresses! see below). @@ -107,7 +106,7 @@ other information. The validator doesn't, by default, permit obsoleted forms of email addresses that no one uses anymore even though they are still valid and deliverable, since they will probably give you grief if you're using email for login. (See -later in the document about that.) +later in the document about how to allow some obsolete forms.) The validator checks that the domain name in the email address has a DNS MX record (except a NULL MX record) indicating that it can receive @@ -137,6 +136,8 @@ The `validate_email` function also accepts the following keyword arguments `allow_quoted_local=False`: Set to `True` to allow obscure and potentially problematic email addresses in which the part of the address before the @-sign contains spaces, @-signs, or other surprising characters when the local part is surrounded in quotes (so-called quoted-string local parts). In the object returned by `validate_email`, the normalized local part removes any unnecessary backslash-escaping and even removes the surrounding quotes if the address would be valid without them. You can also set `email_validator.ALLOW_QUOTED_LOCAL` to `True` to turn this on for all calls by default. 
+`allow_domain_literal=False`: Set to `True` to allow bracketed IPv4 and "IPv6:"-prefixed IPv6 addresses in the domain part of the email address. No deliverability checks are performed for these addresses. In the object returned by `validate_email`, the normalized domain will use the condensed IPv6 format, if applicable. The object's `domain_address` attribute will hold the parsed `ipaddress.IPv4Address` or `ipaddress.IPv6Address` object if applicable. You can also set `email_validator.ALLOW_DOMAIN_LITERAL` to `True` to turn this on for all calls by default. + `allow_empty_local=False`: Set to `True` to allow an empty local part (i.e. `@example.com`), e.g. for validating Postfix aliases. @@ -291,10 +292,12 @@ and conversion from Punycode to Unicode characters. 3.1](https://tools.ietf.org/html/rfc6532#section-3.1) and [RFC 5895 (IDNA 2008) section 2](http://www.ietf.org/rfc/rfc5895.txt).) -Normalization is also applied to quoted-string local parts if you have -allowed them by the `allow_quoted_local` option. Unnecessary backslash -escaping is removed and even the surrounding quotes are removed if they -are unnecessary. +Normalization is also applied to quoted-string local parts and domain +literal IPv6 addresses if you have allowed them by the `allow_quoted_local` +and `allow_domain_literal` options. In quoted-string local parts, unnecessary +backslash escaping is removed and even the surrounding quotes are removed if +they are unnecessary. For IPv6 domain literals, the IPv6 address is +normalized to condensed form. Examples -------- @@ -369,6 +372,7 @@ are: | `ascii_local_part` | If set, the local part, which is composed of ASCII characters only. | | `domain` | The canonical internationalized Unicode form of the domain part of the email address. 
If the returned string contains non-ASCII characters, either the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) feature of your mail relay will be required to transmit the message or else the email address's domain part must be converted to IDNA ASCII first: Use `ascii_domain` field instead. | | `ascii_domain` | The [IDNA](https://tools.ietf.org/html/rfc5891) [Punycode](https://www.rfc-editor.org/rfc/rfc3492.txt)-encoded form of the domain part of the given email address, as it would be transmitted on the wire. | +| `domain_address` | If domain literals are allowed and if the email address contains one, an `ipaddress.IPv4Address` or `ipaddress.IPv6Address` object. | | `smtputf8` | A boolean indicating that the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) feature of your mail relay will be required to transmit messages to this address because the local part of the address has non-ASCII characters (the local part cannot be IDNA-encoded). If `allow_smtputf8=False` is passed as an argument, this flag will always be false because an exception is raised if it would have been true. | | `mx` | A list of (priority, domain) tuples of MX records specified in the DNS for the domain (see [RFC 5321 section 5](https://tools.ietf.org/html/rfc5321#section-5)). May be `None` if the deliverability check could not be completed because of a temporary issue like a timeout. | | `mx_fallback_type` | `None` if an `MX` record is found. If no MX records are actually specified in DNS and instead are inferred, through an obsolete mechanism, from A or AAAA records, the value is the type of DNS record used instead (`A` or `AAAA`). May be `None` if the deliverability check could not be completed because of a temporary issue like a timeout. | @@ -390,13 +394,12 @@ or likely to cause trouble: domain names without a `.`, are rejected as a syntax error (except see the `test_environment` parameter above). 
* Obsolete email syntaxes are rejected: - The "quoted string" form of the local part of the email address (RFC - 5321 4.1.2) is not permitted unless `allow_quoted_local=True` is given - (see above). The unusual ["(comment)" syntax](https://github.com/JoshData/python-email-validator/issues/77) - is also rejected. The "literal" form for the domain part of an email address (an - IP address in brackets) is rejected. Other obsolete and deprecated syntaxes are - rejected. No one uses these forms anymore. + is rejected. Extremely old obsolete syntaxes are + rejected. Quoted-string local parts and domain-literal addresses + are rejected by default, but there are options to allow them (see above). + No one uses these forms anymore, and I can't think of any reason why anyone + using this library would need to accept them. Testing diff --git a/email_validator/__init__.py b/email_validator/__init__.py index aa0dc7c..d5f26a2 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -26,6 +26,7 @@ def caching_resolver(*args, **kwargs): ALLOW_SMTPUTF8 = True ALLOW_QUOTED_LOCAL = False +ALLOW_DOMAIN_LITERAL = False GLOBALLY_DELIVERABLE = True CHECK_DELIVERABILITY = True TEST_ENVIRONMENT = False diff --git a/email_validator/__main__.py b/email_validator/__main__.py index 52d1054..a414ff6 100644 --- a/email_validator/__main__.py +++ b/email_validator/__main__.py @@ -28,8 +28,8 @@ def main(dns_resolver=None): # Set options from environment variables. 
options = {} - for varname in ('ALLOW_SMTPUTF8', 'ALLOW_QUOTED_LOCAL', 'GLOBALLY_DELIVERABLE', - 'CHECK_DELIVERABILITY', 'TEST_ENVIRONMENT'): + for varname in ('ALLOW_SMTPUTF8', 'ALLOW_QUOTED_LOCAL', 'ALLOW_DOMAIN_LITERAL', + 'GLOBALLY_DELIVERABLE', 'CHECK_DELIVERABILITY', 'TEST_ENVIRONMENT'): if varname in os.environ: options[varname.lower()] = bool(os.environ[varname]) for varname in ('DEFAULT_TIMEOUT',): diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py index 176d50e..d623301 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/exceptions_types.py @@ -36,6 +36,9 @@ class ValidatedEmail(object): Unicode from IDNA ascii.""" domain: str + """If the domain part is a domain literal, the IPv4Address or IPv6Address object.""" + domain_address: object + """If not None, a form of the email address that uses 7-bit ASCII characters only.""" ascii_email: Optional[str] @@ -118,4 +121,7 @@ def as_constructor(self): """Convenience method for accessing ValidatedEmail as a dict""" def as_dict(self): - return self.__dict__ + d = self.__dict__ + if d.get('domain_address'): + d['domain_address'] = repr(d['domain_address']) + return d diff --git a/email_validator/rfc_constants.py b/email_validator/rfc_constants.py index cfbde12..00b6358 100644 --- a/email_validator/rfc_constants.py +++ b/email_validator/rfc_constants.py @@ -27,6 +27,9 @@ DOT_ATOM_TEXT_HOSTNAME = re.compile(HOSTNAME_LABEL + r'(?:\.' 
+ HOSTNAME_LABEL + r')*\Z') DOMAIN_NAME_REGEX = re.compile(r"[A-Za-z]\Z") # all TLDs currently end with a letter +# Domain literal (RFC 5322 3.4.1) +DOMAIN_LITERAL_CHARS = re.compile(r"[\u0021-\u00FA\u005E-\u007E]") + # Quoted-string local part (RFC 5321 4.1.2, internationalized by RFC 6531 section 3.3) # The permitted characters in a quoted string are the characters in the range # 32-126, except that quotes and (literal) backslashes can only appear when escaped diff --git a/email_validator/syntax.py b/email_validator/syntax.py index b10e5de..87e9ede 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -1,11 +1,12 @@ from .exceptions_types import EmailSyntaxError from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \ DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \ - DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX + DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS import re import unicodedata import idna # implements IDNA 2008; Python's codec is only IDNA 2003 +import ipaddress from typing import Optional @@ -272,13 +273,9 @@ def check_dot_atom(label, start_descr, end_descr, is_hostname): raise EmailSyntaxError("An email address cannot have a period and a hyphen next to each other.") -def validate_email_domain_part(domain, test_environment=False, globally_deliverable=True): +def validate_email_domain_name(domain, test_environment=False, globally_deliverable=True): """Validates the syntax of the domain part of an email address.""" - # Empty? - if len(domain) == 0: - raise EmailSyntaxError("There must be something after the @-sign.") - # Check for invalid characters before normalization. 
# (RFC 952 plus RFC 6531 section 3.3 for internationalized addresses) bad_chars = set( @@ -432,3 +429,63 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera "ascii_domain": ascii_domain, "domain": domain_i18n, } + + +def validate_email_domain_literal(domain_literal, allow_domain_literal=False): + # This is obscure domain-literal syntax. Parse it and return + # a compressed/normalized address. + # RFC 5321 4.1.3 and RFC 5322 3.4.1. + + # Try to parse the domain literal as an IPv4 address. + # There is no tag for IPv4 addresses, so we can never + # be sure if the user intends an IPv4 address. + if re.match(r"^[0-9\.]+$", domain_literal): + try: + addr = ipaddress.IPv4Address(domain_literal) + except ValueError as e: + raise EmailSyntaxError(f"The address in brackets after the @-sign is not valid: It is not an IPv4 address ({e}) or is missing an address literal tag.") + if not allow_domain_literal: + raise EmailSyntaxError("A bracketed IPv4 address after the @-sign is not allowed here.") + + # Return the IPv4Address object and the domain back unchanged. + return { + "domain_address": addr, + "domain": f"[{addr}]", + } + + # If it begins with "IPv6:" it's an IPv6 address. + if domain_literal.startswith("IPv6:"): + try: + addr = ipaddress.IPv6Address(domain_literal[5:]) + except ValueError as e: + raise EmailSyntaxError(f"The IPv6 address in brackets after the @-sign is not valid ({e}).") + if not allow_domain_literal: + raise EmailSyntaxError("A bracketed IPv6 address after the @-sign is not allowed here.") + + # Return the IPv6Address object and construct a normalized + # domain literal. 
+ return { + "domain_address": addr, + "domain": f"[IPv6:{addr.compressed}]", + } + + if ":" not in domain_literal: + raise EmailSyntaxError("The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.") + + # The tag (the part before the colon) has character restrictions, + # but since it must come from a registry of tags (in which only "IPv6" is defined), + # there's no need to check the syntax of the tag. See RFC 5321 4.1.2. + + # Check for permitted ASCII characters. This actually doesn't matter + # since there will be an exception after anyway. + bad_chars = set( + safe_character_display(c) + for c in domain_literal + if not DOMAIN_LITERAL_CHARS.match(c) + ) + if bad_chars: + raise EmailSyntaxError("The part after the @-sign contains invalid characters in brackets: " + ", ".join(sorted(bad_chars)) + ".") + + # There are no other domain literal tags. + # https://www.iana.org/assignments/address-literal-tags/address-literal-tags.xhtml + raise EmailSyntaxError("The part after the @-sign contains an invalid address literal tag in brackets.") diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index 28f1151..4ce11cb 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -1,7 +1,7 @@ from typing import Optional, Union from .exceptions_types import EmailSyntaxError, ValidatedEmail -from .syntax import validate_email_local_part, validate_email_domain_part, get_length_reason +from .syntax import validate_email_local_part, validate_email_domain_name, validate_email_domain_literal, get_length_reason from .rfc_constants import EMAIL_MAX_LENGTH, QUOTED_LOCAL_PART_ADDR @@ -12,6 +12,7 @@ def validate_email( allow_smtputf8: Optional[bool] = None, allow_empty_local: bool = False, allow_quoted_local: Optional[bool] = None, + allow_domain_literal: Optional[bool] = None, check_deliverability: Optional[bool] = None, test_environment: Optional[bool] = None, globally_deliverable: 
Optional[bool] = None, @@ -25,12 +26,14 @@ def validate_email( """ # Fill in default values of arguments. - from . import ALLOW_SMTPUTF8, ALLOW_QUOTED_LOCAL, GLOBALLY_DELIVERABLE, \ - CHECK_DELIVERABILITY, TEST_ENVIRONMENT, DEFAULT_TIMEOUT + from . import ALLOW_SMTPUTF8, ALLOW_QUOTED_LOCAL, ALLOW_DOMAIN_LITERAL, \ + GLOBALLY_DELIVERABLE, CHECK_DELIVERABILITY, TEST_ENVIRONMENT, DEFAULT_TIMEOUT if allow_smtputf8 is None: allow_smtputf8 = ALLOW_SMTPUTF8 if allow_quoted_local is None: allow_quoted_local = ALLOW_QUOTED_LOCAL + if allow_domain_literal is None: + allow_domain_literal = ALLOW_DOMAIN_LITERAL if check_deliverability is None: check_deliverability = CHECK_DELIVERABILITY if test_environment is None: @@ -90,9 +93,24 @@ def validate_email( ret.smtputf8 = local_part_info["smtputf8"] # Validate the email address's domain part syntax and get a normalized form. - domain_part_info = validate_email_domain_part(domain_part, test_environment=test_environment, globally_deliverable=globally_deliverable) - ret.domain = domain_part_info["domain"] - ret.ascii_domain = domain_part_info["ascii_domain"] + is_domain_literal = False + if len(domain_part) == 0: + raise EmailSyntaxError("There must be something after the @-sign.") + + elif domain_part.startswith("[") and domain_part.endswith("]"): + # Parse the address in the domain literal and get back a normalized domain. + domain_part_info = validate_email_domain_literal(domain_part[1:-1], allow_domain_literal=allow_domain_literal) + ret.domain = domain_part_info["domain"] + ret.ascii_domain = domain_part_info["domain"] # Domain literals are always ASCII. + ret.domain_address = domain_part_info["domain_address"] + is_domain_literal = True # Prevent deliverability checks. + + else: + # Check the syntax of the domain and get back a normalized + # internationalized and ASCII form. 
+ domain_part_info = validate_email_domain_name(domain_part, test_environment=test_environment, globally_deliverable=globally_deliverable) + ret.domain = domain_part_info["domain"] + ret.ascii_domain = domain_part_info["ascii_domain"] # Construct the complete normalized form. ret.email = ret.local_part + "@" + ret.domain @@ -148,6 +166,10 @@ def validate_email( # Validate the email address's deliverability using DNS # and update the return dict with metadata. + if is_domain_literal: + # There is nothing to check --- skip deliverability checks. + return ret + # Lazy load `deliverability` as it is slow to import (due to dns.resolver) from .deliverability import validate_email_deliverability deliverability_info = validate_email_deliverability( diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 1c0954e..1c5f0c4 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -249,6 +249,23 @@ def test_email_valid_only_if_quoted_local_part(email_input, normalized_local_par assert validated.local_part == normalized_local_part +def test_domain_literal(): + # Check parsing IPv4 addresses. + validated = validate_email("me@[127.0.0.1]", allow_domain_literal=True) + assert validated.domain == "[127.0.0.1]" + assert repr(validated.domain_address) == "IPv4Address('127.0.0.1')" + + # Check parsing IPv6 addresses. + validated = validate_email("me@[IPv6:::1]", allow_domain_literal=True) + assert validated.domain == "[IPv6:::1]" + assert repr(validated.domain_address) == "IPv6Address('::1')" + + # Check that IPv6 addresses are normalized. 
+ validated = validate_email("me@[IPv6:0000:0000:0000:0000:0000:0000:0000:0001]", allow_domain_literal=True) + assert validated.domain == "[IPv6:::1]" + assert repr(validated.domain_address) == "IPv6Address('::1')" + + @pytest.mark.parametrize( 'email_input,error_msg', [ @@ -304,6 +321,13 @@ def test_email_valid_only_if_quoted_local_part(email_input, normalized_local_par ('me@xn--0.tld', 'The part after the @-sign is not valid IDNA (Invalid A-label).'), ('me@yy--0.tld', 'An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.'), ('me@yy--0.tld', 'An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.'), + ('me@[127.0.0.1]', 'A bracketed IPv4 address after the @-sign is not allowed here.'), + ('me@[127.0.0.999]', 'The address in brackets after the @-sign is not valid: It is not an IPv4 address (Octet 999 (> 255) not permitted in \'127.0.0.999\') or is missing an address literal tag.'), + ('me@[IPv6:::1]', 'A bracketed IPv6 address after the @-sign is not allowed here.'), + ('me@[IPv6:::G]', 'The IPv6 address in brackets after the @-sign is not valid (Only hex digits permitted in \'G\' in \'::G\').'), + ('me@[tag:text]', 'The part after the @-sign contains an invalid address literal tag in brackets.'), + ('me@[untaggedtext]', 'The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.'), + ('me@[tag:invalid space]', 'The part after the @-sign contains invalid characters in brackets: SPACE.'), ], ) def test_email_invalid_syntax(email_input, error_msg): @@ -565,31 +589,50 @@ def test_email_test_domain_name_in_test_environment(): ) def test_pyisemail_tests(email_input, status): if status == "ISEMAIL_VALID": - # All standard email address forms should not raise an exception. + # All standard email address forms should not raise an exception + # with any set of parsing options. 
validate_email(email_input, test_environment=True) + validate_email(email_input, allow_quoted_local=True, allow_domain_literal=True, test_environment=True) elif status == "ISEMAIL_RFC5321_QUOTEDSTRING": - # Only valid with an option. + # Quoted-literal local parts are only valid with an option. with pytest.raises(EmailSyntaxError): validate_email(email_input, test_environment=True) validate_email(email_input, allow_quoted_local=True, test_environment=True) + elif "_ADDRESSLITERAL" in status or status == 'ISEMAIL_RFC5321_IPV6DEPRECATED': + # Domain literals with IPv4 or IPv6 addresses are only valid with an option. + # I am not sure if the ISEMAIL_RFC5321_IPV6DEPRECATED case should be rejected: + # The Python ipaddress module accepts it. + with pytest.raises(EmailSyntaxError): + validate_email(email_input, test_environment=True) + validate_email(email_input, allow_domain_literal=True, test_environment=True) + + elif "_DOMLIT_" in status or "DOMAINLITERAL" in status or "_IPV6" in status: + # Invalid domain literals even when allow_domain_literal=True. + # The _DOMLIT_ diagnoses appear to be invalid domain literals. + # The DOMAINLITERAL diagnoses appear to be valid domain literals that can't + # be parsed as an IPv4 or IPv6 address. + # The _IPV6_ diagnoses appear to represent syntactically invalid domain literals. + with pytest.raises(EmailSyntaxError): + validate_email(email_input, allow_domain_literal=True, test_environment=True) + elif "_ERR_" in status or "_TOOLONG" in status \ or "_CFWS_FWS" in status or "_CFWS_COMMENT" in status \ - or "_IPV6" in status or status == "ISEMAIL_RFC5322_DOMAIN": + or status == "ISEMAIL_RFC5322_DOMAIN": # Invalid syntax, extraneous whitespace, and "(comments)" should be rejected. - # The _IPV6_ diagnoses appear to represent syntactically invalid domain literals. # The ISEMAIL_RFC5322_DOMAIN diagnosis appears to be a syntactically invalid domain. + # These are invalid with any set of options. 
with pytest.raises(EmailSyntaxError): validate_email(email_input, test_environment=True) + validate_email(email_input, allow_quoted_local=True, allow_domain_literal=True, test_environment=True) - elif "_DEPREC_" in status \ - or "DOMAINLITERAL" in status or "_DOMLIT_" in status or "_ADDRESSLITERAL" in status: - # Domain literals (IP addresses in brackets) - # and other deprecated syntax are valid email addresses and are accepted by pyIsEmail, - # but we reject them. + elif "_DEPREC_" in status: + # Various deprecated syntax are valid email addresses and are accepted by pyIsEmail, + # but we reject them even with extended options. with pytest.raises(EmailSyntaxError): validate_email(email_input, test_environment=True) + validate_email(email_input, allow_quoted_local=True, allow_domain_literal=True, test_environment=True) else: raise ValueError(f"status {status} is not recognized") From 03cdade6dacc8b1a213661d41213cd683922b3dd Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 15 Apr 2023 09:33:35 -0400 Subject: [PATCH 092/174] Some README improvements, remove references to obsoleted RFCs, add a test for an obsoleted quoted-string syntax --- README.md | 87 +++++++++++++++++++------------- email_validator/rfc_constants.py | 13 ++--- email_validator/syntax.py | 14 ++--- tests/test_syntax.py | 1 + 4 files changed, 64 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index 563e36b..57b186a 100644 --- a/README.md +++ b/README.md @@ -14,22 +14,23 @@ Key features: * Checks that an email address has the correct syntax --- good for registration/login forms or other uses related to identifying users. - By default, rejects obsolete email address syntax that you'd find unexpected. * Gives friendly English error messages when validation fails that you can display to end-users. * Checks deliverability (optional): Does the domain name resolve? (You can override the default DNS resolver to add query caching.) 
* Supports internationalized domain names and internationalized local parts. - Blocks unsafe characters for your safety. +* Rejects addresses with unsafe Unicode characters, obsolete email address + syntax that you'd find unexpected, special use domain names like + `@localhost`, and domains without a dot by default. This is an + opinionated library! * Normalizes email addresses (important for internationalized and quoted-string addresses! see below). * Python type annotations are used. -This library does NOT permit obsolete forms of email addresses by default, -so if you need strict validation against the email specs exactly, use -[pyIsEmail](https://github.com/michaelherold/pyIsEmail) or try -[flanker](https://github.com/mailgun/flanker) if you are parsing the -"To:" line of an email. +This is an opinionated library. You should definitely also consider using +the less-opinionated [pyIsEmail](https://github.com/michaelherold/pyIsEmail) and +[flanker](https://github.com/mailgun/flanker) if they are better for your +use case. [![Build Status](https://github.com/JoshData/python-email-validator/actions/workflows/test_and_build.yaml/badge.svg)](https://github.com/JoshData/python-email-validator/actions/workflows/test_and_build.yaml) @@ -57,21 +58,23 @@ account in your application, you might do this: ```python from email_validator import validate_email, EmailNotValidError -email = "my+address@mydomain.tld" -is_new_account = True # False for login pages +email = "my+address@example.org" try: - # Check that the email address is valid. - validation = validate_email(email, check_deliverability=is_new_account) - # Take the normalized form of the email address - # for all logic beyond this point (especially - # before going to a database query where equality - # may not take into account Unicode normalization). + # Check that the email address is valid. Turn on check_deliverability + # for first-time validations like on account creation pages (but not + # login pages). 
+ validation = validate_email(email, check_deliverability=False) + + # After this point, use only the normalized form of the email address, + # especially before going to a database query. email = validation.email + except EmailNotValidError as e: - # Email is not valid. - # The exception message is human-readable. + + # The exception message is a human-readable explanation of why it's + # not a valid (or deliverable) email address. print(str(e)) ``` @@ -108,12 +111,15 @@ that no one uses anymore even though they are still valid and deliverable, since they will probably give you grief if you're using email for login. (See later in the document about how to allow some obsolete forms.) -The validator checks that the domain name in the email address has a -DNS MX record (except a NULL MX record) indicating that it can receive -email (or a fallback A-record, see below). -There is nothing to be gained by trying to actually contact an SMTP -server, so that's not done here. For privacy, security, and practicality -reasons servers are good at not giving away whether an address is +The validator optionally checks that the domain name in the email address has +a DNS MX record indicating that it can receive email. (Except a Null MX record. +If there is no MX record, a fallback A/AAAA-record is permitted, unless +a reject-all SPF record is present.) DNS is slow and sometimes unavailable or +unreliable, so consider whether these checks are useful for your use case and +turn them off if they aren't. +There is nothing to be gained by trying to actually contact an SMTP server, so +that's not done here. For privacy, security, and practicality reasons, servers +are good at not giving away whether an address is deliverable or not: email addresses that appear to accept mail at first can bounce mail after a delay, and bounced mail may indicate a temporary failure of a good email address (sometimes an intentional failure, like @@ -124,11 +130,11 @@ greylisting).
The `validate_email` function also accepts the following keyword arguments (defaults are as shown below): -`check_deliverability=True`: If true, a DNS query is made to check that a non-null MX record is present for the domain-part of the email address (or if not, an A/AAAA record as an MX fallback can be present but in that case a reject-all SPF record must not be present). Set to `False` to skip this DNS-based check. DNS is slow and sometimes unavailable, so consider whether these checks are useful for your use case. It is recommended to pass `False` when performing validation for login pages (but not account creation pages) since re-validation of a previously validated domain in your database by querying DNS at every login is probably undesirable. You can also set `email_validator.CHECK_DELIVERABILITY` to `False` to turn this off for all calls by default. +`check_deliverability=True`: If true, DNS queries are made to check that the domain name in the email address (the part after the @-sign) can receive mail, as described above. Set to `False` to skip this DNS-based check. It is recommended to pass `False` when performing validation for login pages (but not account creation pages) since re-validation of a previously validated domain in your database by querying DNS at every login is probably undesirable. You can also set `email_validator.CHECK_DELIVERABILITY` to `False` to turn this off for all calls by default. -`dns_resolver=None`: Pass an instance of [dns.resolver.Resolver](https://dnspython.readthedocs.io/en/latest/resolver-class.html) to control the DNS resolver including setting a timeout and [a cache](https://dnspython.readthedocs.io/en/latest/resolver-caching.html). The `caching_resolver` function shown above is a helper function to construct a dns.resolver.Resolver with a [LRUCache](https://dnspython.readthedocs.io/en/latest/resolver-caching.html#dns.resolver.LRUCache). 
Reuse the same resolver instance across calls to `validate_email` to make use of the cache. +`dns_resolver=None`: Pass an instance of [dns.resolver.Resolver](https://dnspython.readthedocs.io/en/latest/resolver-class.html) to control the DNS resolver including setting a timeout and [a cache](https://dnspython.readthedocs.io/en/latest/resolver-caching.html). The `caching_resolver` function shown below is a helper function to construct a dns.resolver.Resolver with a [LRUCache](https://dnspython.readthedocs.io/en/latest/resolver-caching.html#dns.resolver.LRUCache). Reuse the same resolver instance across calls to `validate_email` to make use of the cache. -`test_environment=False`: DNS-based deliverability checks are disabled and `test` and `subdomain.test` domain names are permitted (see below). You can also set `email_validator.TEST_ENVIRONMENT` to `True` to turn it on for all calls by default. +`test_environment=False`: If `True`, DNS-based deliverability checks are disabled and `test` and `**.test` domain names are permitted (see below). You can also set `email_validator.TEST_ENVIRONMENT` to `True` to turn it on for all calls by default. `allow_smtputf8=True`: Set to `False` to prohibit internationalized addresses that would require the @@ -157,18 +163,18 @@ while True: ### Test addresses -This library rejects email addresess that use the [Special Use Domain Names](https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml) `invalid`, `localhost`, `test`, and some others by raising `EmailSyntaxError`. This is to protect your system from abuse: You probably don't want a user to be able to cause an email to be sent to `localhost`. However, in your non-production test environments you may want to use `@test` or `@myname.test` email addresses. 
There are three ways you can allow this: +This library rejects email addresses that use the [Special Use Domain Names](https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml) `invalid`, `localhost`, `test`, and some others by raising `EmailSyntaxError`. This is to protect your system from abuse: You probably don't want a user to be able to cause an email to be sent to `localhost` (although they might be able to still do so via a malicious MX record). However, in your non-production test environments you may want to use `@test` or `@myname.test` email addresses. There are three ways you can allow this: 1. Add `test_environment=True` to the call to `validate_email` (see above). -2. Set `email_validator.TEST_ENVIRONMENT` to `True`. -3. Remove the special-use domain name that you want to use from `email_validator.SPECIAL_USE_DOMAIN_NAMES`: +2. Set `email_validator.TEST_ENVIRONMENT` to `True` globally. +3. Remove the special-use domain name that you want to use from `email_validator.SPECIAL_USE_DOMAIN_NAMES`, e.g.: ```python import email_validator email_validator.SPECIAL_USE_DOMAIN_NAMES.remove("test") ``` -It is tempting to use `@example.com/net/org` in tests. These domains are reserved to IANA for use in documentation so there is no risk of accidentally emailing someone at those domains. But beware that this library will reject these domain names if DNS-based deliverability checks are not disabled because these domains do not resolve to domains that accept email. In tests, consider using your own domain name or `@test` or `@myname.test` instead. +It is tempting to use `@example.com/net/org` in tests. They are *not* in this library's `SPECIAL_USE_DOMAIN_NAMES` list so you can, but shouldn't, use them. These domains are reserved to IANA for use in documentation so there is no risk of accidentally emailing someone at those domains.
But beware that this library will nevertheless reject these domain names if DNS-based deliverability checks are not disabled because these domains do not resolve to domains that accept email. In tests, consider using your own domain name or `@test` or `@myname.test` instead. Internationalized email addresses --------------------------------- @@ -205,8 +211,9 @@ especially when the email address is concatenated with other text, so this library tries to protect you by not permitting reserved, non-, private use, formatting (which can be used to alter the display order of characters), whitespace, and control characters, and combining characters -as the first character (so that they cannot combine with something outside -of the email address string). See https://qntm.org/safe and https://trojansource.codes/ +as the first character of the local part and the domain name (so that they +cannot combine with something outside of the email address string or with +the @-sign). See https://qntm.org/safe and https://trojansource.codes/ for relevant prior work. (Other than whitespace, these are checks that you should be applying to nearly all user inputs in a security-sensitive context.) @@ -255,17 +262,23 @@ change the user's login information without telling them.) Normalization ------------- +### Unicode Normalization + The use of Unicode in email addresses introduced a normalization problem. Different Unicode strings can look identical and have the same semantic meaning to the user. The `email` field returned on successful validation provides the correctly normalized form of the given email -address: +address. + +For example, the CJK fullwidth Latin letters are considered semantically +equivalent in domain names to their ASCII counterparts.
This library +normalizes them to their ASCII counterparts: ```python valid = validate_email("me@Domain.com") -email = valid.ascii_email -print(email) -# prints: me@domain.com +print(valid.email) +print(valid.ascii_email) +# prints "me@domain.com" twice ``` Because an end-user might type their email address in different (but @@ -292,6 +305,8 @@ and conversion from Punycode to Unicode characters. 3.1](https://tools.ietf.org/html/rfc6532#section-3.1) and [RFC 5895 (IDNA 2008) section 2](http://www.ietf.org/rfc/rfc5895.txt).) +### Other Normalization + Normalization is also applied to quoted-string local parts and domain literal IPv6 addresses if you have allowed them by the `allow_quoted_local` and `allow_domain_literal` options. In quoted-string local parts, unnecessary diff --git a/email_validator/rfc_constants.py b/email_validator/rfc_constants.py index 00b6358..96816df 100644 --- a/email_validator/rfc_constants.py +++ b/email_validator/rfc_constants.py @@ -2,16 +2,13 @@ import re -# Based on RFC 2822 section 3.2.4 / RFC 5322 section 3.2.3, these -# characters are permitted in email addresses (not taking into -# account internationalization): +# Based on RFC 5322 3.2.3, these characters are permitted in email +# addresses (not taking into account internationalization) separated by dots: ATEXT = r'a-zA-Z0-9_!#\$%&\'\*\+\-/=\?\^`\{\|\}~' ATEXT_RE = re.compile('[.' + ATEXT + ']') # ATEXT plus dots - -# A "dot atom text", per RFC 2822 3.2.4: DOT_ATOM_TEXT = re.compile('[' + ATEXT + ']+(?:\\.[' + ATEXT + r']+)*\Z') -# RFC 6531 section 3.3 extends the allowed characters in internationalized +# RFC 6531 3.3 extends the allowed characters in internationalized # addresses to also include three specific ranges of UTF8 defined in # RFC 3629 section 4, which appear to be the Unicode code points from # U+0080 to U+10FFFF. 
@@ -20,7 +17,7 @@ DOT_ATOM_TEXT_INTL = re.compile('[' + ATEXT_INTL + ']+(?:\\.[' + ATEXT_INTL + r']+)*\Z') # The domain part of the email address, after IDNA (ASCII) encoding, -# must also satisfy the requirements of RFC 952/RFC 1123 Section 2.1 which +# must also satisfy the requirements of RFC 952/RFC 1123 2.1 which # restrict the allowed characters of hostnames further. ATEXT_HOSTNAME_INTL = re.compile(r"[a-zA-Z0-9\-\." + "\u0080-\U0010FFFF" + "]") HOSTNAME_LABEL = r'(?:(?:[a-zA-Z0-9][a-zA-Z0-9\-]*)?[a-zA-Z0-9])' @@ -30,7 +27,7 @@ # Domain literal (RFC 5322 3.4.1) DOMAIN_LITERAL_CHARS = re.compile(r"[\u0021-\u00FA\u005E-\u007E]") -# Quoted-string local part (RFC 5321 4.1.2, internationalized by RFC 6531 section 3.3) +# Quoted-string local part (RFC 5321 4.1.2, internationalized by RFC 6531 3.3) # The permitted characters in a quoted string are the characters in the range # 32-126, except that quotes and (literal) backslashes can only appear when escaped # by a backslash. When internationalized, UTF8 strings are also permitted except diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 87e9ede..abb4ea9 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -64,11 +64,11 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp # Check the local part against the non-internationalized regular expression. # Most email addresses match this regex so it's probably fastest to check this first. - # (RFC 2822 3.2.4) + # (RFC 5322 3.2.3) # All local parts matching the dot-atom rule are also valid as a quoted string # so if it was originally quoted (quoted_local_part is True) and this regex matches, # it's ok. - # (RFC 5321 4.1.2). + # (RFC 5321 4.1.2 / RFC 5322 3.2.4). m = DOT_ATOM_TEXT.match(local) if m: # It's valid. 
And since it's just the permitted ASCII characters, @@ -95,7 +95,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp if not allow_smtputf8: # Check for invalid characters against the non-internationalized # permitted character set. - # (RFC 2822 Section 3.2.4 / RFC 5322 Section 3.2.3) + # (RFC 5322 3.2.3) bad_chars = set( safe_character_display(c) for c in local @@ -184,7 +184,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp # don't apply in those cases.) # Check for invalid characters. - # (RFC 2822 Section 3.2.4 / RFC 5322 Section 3.2.3, plus RFC 6531 section 3.3) + # (RFC 5322 3.2.3, plus RFC 6531 3.3) bad_chars = set( safe_character_display(c) for c in local @@ -194,7 +194,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".") # Check for dot errors imposted by the dot-atom rule. - # (RFC 2822 3.2.4) + # (RFC 5322 3.2.3) check_dot_atom(local, 'An email address cannot start with a {}.', 'An email address cannot have a {} immediately before the @-sign.', is_hostname=False) # All of the reasons should already have been checked, but just in case @@ -255,7 +255,7 @@ def check_unsafe_chars(s, allow_space=False): def check_dot_atom(label, start_descr, end_descr, is_hostname): - # RFC 2822 3.2.4 + # RFC 5322 3.2.3 if label.endswith("."): raise EmailSyntaxError(end_descr.format("period")) if label.startswith("."): @@ -308,7 +308,7 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera # Check that before we do IDNA encoding because the IDNA library gives # unfriendly errors for these cases, but after UTS-46 normalization because # it can insert periods and hyphens (from fullwidth characters). 
- # (RFC 952, RFC 2822 3.2.4) + # (RFC 952, RFC 1123 2.1, RFC 5322 3.2.3) check_dot_atom(domain, 'An email address cannot have a {} immediately after the @-sign.', 'An email address cannot end with a {}.', is_hostname=True) # Check for RFC 5890's invalid R-LDH labels, which are labels that start diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 1c5f0c4..707d0e8 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -306,6 +306,7 @@ def test_domain_literal(): ('my\n@example.com', 'The email address contains invalid characters before the @-sign: U+000A.'), ('test@\n', 'The part after the @-sign contains invalid characters: U+000A.'), ('bad"quotes"@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), + ('obsolete."quoted".atom@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), ('11111111112222222222333333333344444444445555555555666666666677777@example.com', 'The email address is too long before the @-sign (1 character too many).'), ('111111111122222222223333333333444444444455555555556666666666777777@example.com', 'The email address is too long before the @-sign (2 characters too many).'), ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444444444455555555556.com', 'The email address is too long (4 characters too many).'), From bfa538fe4c90bea65881e1238eaaa895bd82e892 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 15 Apr 2023 09:41:01 -0400 Subject: [PATCH 093/174] Emit a DeprecationWarning for the old dict-like access to the return value of validate_email and remove some internal uses of it --- CHANGELOG.md | 1 + email_validator/exceptions_types.py | 2 ++ email_validator/validate_email.py | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md 
b/CHANGELOG.md index f857b28..a6ad16f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ There are no significant changes to which email addresses are considered valid/i * The tests have been reorganized and expanded. Deliverability tests now mostly use captured DNS responses so they can be run off-line. * The __main__ tool now reads options to validate_email from environment variables. * Type annotations have been added to the exported methods and the ValidatedEmail class and some internal methods. +* The old dict-like pattern for the return value of validate_email is deprecated. Version 1.3.1 (January 21, 2023) -------------------------------- diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py index d623301..978abbc 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/exceptions_types.py @@ -1,3 +1,4 @@ +import warnings from typing import Optional @@ -72,6 +73,7 @@ def __repr__(self): """For backwards compatibility, some fields are also exposed through a dict-like interface. 
Note that some of the names changed when they became attributes.""" def __getitem__(self, key): + warnings.warn("dict-like access to the return value of validate_email is deprecated and may not be supported in the future.", DeprecationWarning, stacklevel=2) if key == "email": return self.email if key == "email_ascii": diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index 4ce11cb..bdbffc8 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -173,7 +173,7 @@ def validate_email( # Lazy load `deliverability` as it is slow to import (due to dns.resolver) from .deliverability import validate_email_deliverability deliverability_info = validate_email_deliverability( - ret["domain"], ret["domain_i18n"], timeout, dns_resolver + ret.ascii_domain, ret.domain, timeout, dns_resolver ) for key, value in deliverability_info.items(): setattr(ret, key, value) From 99e51228e2d8d4b2befb3b803d895bbece1df84c Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 15 Apr 2023 09:55:58 -0400 Subject: [PATCH 094/174] Rename the `email` field of ValidatedEmail to `normalized` to be clearer about its importance --- CHANGELOG.md | 1 + README.md | 38 +++++++++++++------------- email_validator/exceptions_types.py | 18 +++++++++---- email_validator/validate_email.py | 16 +++++------ tests/test_main.py | 6 ++--- tests/test_syntax.py | 42 ++++++++++++++++------------- 6 files changed, 66 insertions(+), 55 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a6ad16f..a2a898b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ There are no significant changes to which email addresses are considered valid/i * Some syntax error messages have changed because they are now checked explicitly rather than as a part of other checks. * The quoted-string local part syntax (e.g. multiple @-signs, spaces, etc. if surrounded by quotes) and domain-literal addresses (e.g. @[192.XXX...] 
or @[IPv6:...]) are now parsed but not considered valid by default. Better error messages are now given for these addresses since it can be confusing for a technically valid address to be rejected, and new allow_quoted_local and allow_domain_literal options are added to allow these addresses if you really need them. * Some other error messages have changed to not repeat the email address in the error message. +* The `email` field on the returned `ValidatedEmail` object has been renamed to `normalized` to be clearer about its importance, but access via `.email` is also still supported. * The library has been reorganized internally into smaller modules. * The tests have been reorganized and expanded. Deliverability tests now mostly use captured DNS responses so they can be run off-line. * The __main__ tool now reads options to validate_email from environment variables. diff --git a/README.md b/README.md index 57b186a..0a7db95 100644 --- a/README.md +++ b/README.md @@ -65,11 +65,11 @@ try: # Check that the email address is valid. Turn on check_deliverability # for first-time validations like on account creation pages (but not # login pages). - validation = validate_email(email, check_deliverability=False) + emailinfo = validate_email(email, check_deliverability=False) # After this point, use only the normalized form of the email address, # especially before going to a database query. 
- email = validation.email + email = emailinfo.normalized except EmailNotValidError as e: @@ -158,7 +158,7 @@ from email_validator import validate_email, caching_resolver resolver = caching_resolver(timeout=10) while True: - email = validate_email(email, dns_resolver=resolver).email + validate_email(email, dns_resolver=resolver) ``` ### Test addresses @@ -249,8 +249,8 @@ This library gives you back the ASCII-ized form in the `ascii_email` field in the returned object, which you can get like this: ```python -valid = validate_email(email, allow_smtputf8=False) -email = valid.ascii_email +emailinfo = validate_email(email, allow_smtputf8=False) +email = emailinfo.ascii_email ``` The local part is left alone (if it has internationalized characters @@ -266,7 +266,7 @@ Normalization The use of Unicode in email addresses introduced a normalization problem. Different Unicode strings can look identical and have the same -semantic meaning to the user. The `email` field returned on successful +semantic meaning to the user. The `normalized` field returned on successful validation provides the correctly normalized form of the given email address. @@ -275,9 +275,9 @@ equivalent in domain names to their ASCII counterparts. 
This library normalizes them to their ASCII counterparts: ```python -valid = validate_email("me@Domain.com") -print(valid.email) -print(valid.ascii_email) +emailinfo = validate_email("me@Domain.com") +print(emailinfo.normalized) +print(emailinfo.ascii_email) # prints "me@domain.com" twice ``` @@ -321,7 +321,7 @@ For the email address `test@joshdata.me`, the returned object is: ```python ValidatedEmail( - email='test@joshdata.me', + normalized='test@joshdata.me', local_part='test', domain='joshdata.me', ascii_email='test@joshdata.me', @@ -335,7 +335,7 @@ internationalized domain but ASCII local part, the returned object is: ```python ValidatedEmail( - email='example@ツ.life', + normalized='example@ツ.life', local_part='example', domain='ツ.life', ascii_email='example@xn--bdk.life', @@ -345,20 +345,20 @@ ValidatedEmail( ``` -Note that the `email` and `domain` fields provide a normalized form of the +Note that `normalized` and other fields provide a normalized form of the email address, domain name, and (in other cases) local part (see earlier discussion of normalization), which you should use in your database. Calling `validate_email` with the ASCII form of the above email address, `example@xn--bdk.life`, returns the exact same information (i.e., the -`email` field always will contain Unicode characters, not Punycode). +`normalized` field always will contain Unicode characters, not Punycode). For the fictitious address `ツ-test@joshdata.me`, which has an internationalized local part, the returned object is: ```python ValidatedEmail( - email='ツ-test@joshdata.me', + normalized='ツ-test@joshdata.me', local_part='ツ-test', domain='joshdata.me', ascii_email=None, @@ -368,10 +368,8 @@ ValidatedEmail( ``` Now `smtputf8` is `True` and `ascii_email` is `None` because the local -part of the address is internationalized. 
The `local_part` and `email` fields -return the normalized form of the address: certain Unicode characters -(such as angstrom and ohm) may be replaced by other equivalent code -points (a-with-ring and omega). +part of the address is internationalized. The `local_part` and `normalized` fields +return the normalized form of the address. Return value ------------ @@ -381,8 +379,8 @@ are: | Field | Value | | -----:|-------| -| `email` | The normalized form of the email address that you should put in your database. This combines the `local_part` and `domain` fields (see below). | -| `ascii_email` | If set, an ASCII-only form of the email address by replacing the domain part with [IDNA](https://tools.ietf.org/html/rfc5891) [Punycode](https://www.rfc-editor.org/rfc/rfc3492.txt). This field will be present when an ASCII-only form of the email address exists (including if the email address is already ASCII). If the local part of the email address contains internationalized characters, `ascii_email` will be `None`. If set, it merely combines `ascii_local_part` and `ascii_domain`. | +| `normalized` | The normalized form of the email address that you should put in your database. This combines the `local_part` and `domain` fields (see below). | +| `ascii_email` | If set, an ASCII-only form of the normalized email address by replacing the domain part with [IDNA](https://tools.ietf.org/html/rfc5891) [Punycode](https://www.rfc-editor.org/rfc/rfc3492.txt). This field will be present when an ASCII-only form of the email address exists (including if the email address is already ASCII). If the local part of the email address contains internationalized characters, `ascii_email` will be `None`. If set, it merely combines `ascii_local_part` and `ascii_domain`. | | `local_part` | The normalized local part of the given email address (before the @-sign). Normalization includes Unicode NFC normalization and removing unnecessary quoted-string quotes and backslashes. 
If `allow_quoted_local` is True and the surrounding quotes are necessary, the quotes _will_ be present in this field. | | `ascii_local_part` | If set, the local part, which is composed of ASCII characters only. | | `domain` | The canonical internationalized Unicode form of the domain part of the email address. If the returned string contains non-ASCII characters, either the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) feature of your mail relay will be required to transmit the message or else the email address's domain part must be converted to IDNA ASCII first: Use `ascii_domain` field instead. | diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py index 978abbc..9a1b331 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/exceptions_types.py @@ -22,13 +22,13 @@ class ValidatedEmail(object): and other information.""" """The email address that was passed to validate_email. (If passed as bytes, this will be a string.)""" - original_email: str + original: str """The normalized email address, which should always be used in preference to the original address. The normalized address converts an IDNA ASCII domain name to Unicode, if possible, and performs Unicode normalization on the local part and on the domain (if originally Unicode). It is the concatenation of the local_part and domain attributes, separated by an @-sign.""" - email: str + normalized: str """The local part of the email address after Unicode normalization.""" local_part: str @@ -68,14 +68,22 @@ def __init__(self, **kwargs): setattr(self, k, v) def __repr__(self): - return f"<ValidatedEmail {self.email}>" + return f"<ValidatedEmail {self.normalized}>" + + """For backwards compatibility, support old field names.""" + def __getattr__(self, key): + if key == "original_email": + return self.original + if key == "email": + return self.normalized + raise AttributeError() """For backwards compatibility, some fields are also exposed through a dict-like interface.
Note that some of the names changed when they became attributes.""" def __getitem__(self, key): warnings.warn("dict-like access to the return value of validate_email is deprecated and may not be supported in the future.", DeprecationWarning, stacklevel=2) if key == "email": - return self.email + return self.normalized if key == "email_ascii": return self.ascii_email if key == "local": @@ -97,7 +105,7 @@ def __eq__(self, other): if not isinstance(other, ValidatedEmail): return False return ( - self.email == other.email + self.normalized == other.normalized and self.local_part == other.local_part and self.domain == other.domain and getattr(self, 'ascii_email', None) == getattr(other, 'ascii_email', None) diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index bdbffc8..6114931 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -76,7 +76,7 @@ def validate_email( # Collect return values in this instance. ret = ValidatedEmail() - ret.original_email = email + ret.original = email # Validate the email address's local part syntax and get a normalized form. # If the original address was quoted and the decoded local part is a valid @@ -113,7 +113,7 @@ def validate_email( ret.ascii_domain = domain_part_info["ascii_domain"] # Construct the complete normalized form. - ret.email = ret.local_part + "@" + ret.domain + ret.normalized = ret.local_part + "@" + ret.domain # If the email address has an ASCII form, add it. if not ret.smtputf8: @@ -144,20 +144,20 @@ def validate_email( # # See the length checks on the local part and the domain. if ret.ascii_email and len(ret.ascii_email) > EMAIL_MAX_LENGTH: - if ret.ascii_email == ret.email: + if ret.ascii_email == ret.normalized: reason = get_length_reason(ret.ascii_email) - elif len(ret.email) > EMAIL_MAX_LENGTH: + elif len(ret.normalized) > EMAIL_MAX_LENGTH: # If there are more than 254 characters, then the ASCII # form is definitely going to be too long. 
- reason = get_length_reason(ret.email, utf8=True) + reason = get_length_reason(ret.normalized, utf8=True) else: reason = "(when converted to IDNA ASCII)" raise EmailSyntaxError(f"The email address is too long {reason}.") - if len(ret.email.encode("utf8")) > EMAIL_MAX_LENGTH: - if len(ret.email) > EMAIL_MAX_LENGTH: + if len(ret.normalized.encode("utf8")) > EMAIL_MAX_LENGTH: + if len(ret.normalized) > EMAIL_MAX_LENGTH: # If there are more than 254 characters, then the UTF-8 # encoding is definitely going to be too long. - reason = get_length_reason(ret.email, utf8=True) + reason = get_length_reason(ret.normalized, utf8=True) else: reason = "(when encoded in bytes)" raise EmailSyntaxError(f"The email address is too long {reason}.") diff --git a/tests/test_main.py b/tests/test_main.py index 34005da..e32af94 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -13,7 +13,7 @@ def test_dict_accessor(): input_email = "testaddr@example.tld" valid_email = validate_email(input_email, check_deliverability=False) assert isinstance(valid_email.as_dict(), dict) - assert valid_email.as_dict()["original_email"] == input_email + assert valid_email.as_dict()["original"] == input_email def test_main_single_good_input(monkeypatch, capsys): @@ -24,7 +24,7 @@ def test_main_single_good_input(monkeypatch, capsys): stdout, _ = capsys.readouterr() output = json.loads(str(stdout)) assert isinstance(output, dict) - assert validate_email(test_email, dns_resolver=RESOLVER).original_email == output["original_email"] + assert validate_email(test_email, dns_resolver=RESOLVER).original == output["original"] def test_main_single_bad_input(monkeypatch, capsys): @@ -53,7 +53,7 @@ def test_bytes_input(): input_email = b"testaddr@example.tld" valid_email = validate_email(input_email, check_deliverability=False) assert isinstance(valid_email.as_dict(), dict) - assert valid_email.as_dict()["email"] == input_email.decode("utf8") + assert valid_email.as_dict()["normalized"] == 
input_email.decode("utf8") input_email = "testaddr中example.tld".encode("utf32") with pytest.raises(EmailSyntaxError): diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 707d0e8..d58b9b4 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -16,7 +16,7 @@ smtputf8=False, ascii_domain='example.tld', domain='example.tld', - email='Abc@example.tld', + normalized='Abc@example.tld', ascii_email='Abc@example.tld', ), ), @@ -28,7 +28,7 @@ smtputf8=False, ascii_domain='test-example.com', domain='test-example.com', - email='Abc.123@test-example.com', + normalized='Abc.123@test-example.com', ascii_email='Abc.123@test-example.com', ), ), @@ -40,7 +40,7 @@ smtputf8=False, ascii_domain='example.tld', domain='example.tld', - email='user+mailbox/department=shipping@example.tld', + normalized='user+mailbox/department=shipping@example.tld', ascii_email='user+mailbox/department=shipping@example.tld', ), ), @@ -52,7 +52,7 @@ smtputf8=False, ascii_domain='example.tld', domain='example.tld', - email="!#$%&'*+-/=?^_`.{|}~@example.tld", + normalized="!#$%&'*+-/=?^_`.{|}~@example.tld", ascii_email="!#$%&'*+-/=?^_`.{|}~@example.tld", ), ), @@ -64,7 +64,7 @@ smtputf8=False, ascii_domain='xn--fiqq24b10vi0d.tw', domain='臺網中心.tw', - email='jeff@臺網中心.tw', + normalized='jeff@臺網中心.tw', ascii_email='jeff@xn--fiqq24b10vi0d.tw', ), ), @@ -74,9 +74,13 @@ def test_email_valid(email_input, output): # These addresses do not require SMTPUTF8. See test_email_valid_intl_local_part # for addresses that are valid but require SMTPUTF8. Check that it passes with # allow_smtput8 both on and off. - assert validate_email(email_input, check_deliverability=False, allow_smtputf8=False) == output + emailinfo = validate_email(email_input, check_deliverability=False, allow_smtputf8=False) + assert emailinfo == output assert validate_email(email_input, check_deliverability=False, allow_smtputf8=True) == output + # Check that the old way to access the normalized form still works. 
+ assert emailinfo.email == emailinfo.normalized + @pytest.mark.parametrize( 'email_input,output', @@ -88,7 +92,7 @@ def test_email_valid(email_input, output): smtputf8=True, ascii_domain='xn--5nqv22n.xn--lhr59c', domain='郵件.商務', - email='伊昭傑@郵件.商務', + normalized='伊昭傑@郵件.商務', ), ), ( @@ -98,7 +102,7 @@ def test_email_valid(email_input, output): smtputf8=True, ascii_domain='xn--l2bl7a9d.xn--o1b8dj2ki', domain='मोहन.ईन्फो', - email='राम@मोहन.ईन्फो', + normalized='राम@मोहन.ईन्फो', ), ), ( @@ -108,7 +112,7 @@ def test_email_valid(email_input, output): smtputf8=True, ascii_domain='xn--80ajglhfv.xn--j1aef', domain='екзампл.ком', - email='юзер@екзампл.ком', + normalized='юзер@екзампл.ком', ), ), ( @@ -118,7 +122,7 @@ def test_email_valid(email_input, output): smtputf8=True, ascii_domain='xn--mxahbxey0c.xn--xxaf0a', domain='εχαμπλε.ψομ', - email='θσερ@εχαμπλε.ψομ', + normalized='θσερ@εχαμπλε.ψομ', ), ), ( @@ -128,7 +132,7 @@ def test_email_valid(email_input, output): smtputf8=True, ascii_domain='xn--fiqq24b10vi0d.tw', domain='臺網中心.tw', - email='葉士豪@臺網中心.tw', + normalized='葉士豪@臺網中心.tw', ), ), ( @@ -138,7 +142,7 @@ def test_email_valid(email_input, output): smtputf8=True, ascii_domain='xn--fiqq24b10vi0d.xn--kpry57d', domain='臺網中心.台灣', - email='葉士豪@臺網中心.台灣', + normalized='葉士豪@臺網中心.台灣', ), ), ( @@ -148,7 +152,7 @@ def test_email_valid(email_input, output): smtputf8=True, ascii_domain='xn--fiqq24b10vi0d.tw', domain='臺網中心.tw', - email='jeff葉@臺網中心.tw', + normalized='jeff葉@臺網中心.tw', ), ), ( @@ -158,7 +162,7 @@ def test_email_valid(email_input, output): smtputf8=True, ascii_domain='example.tld', domain='example.tld', - email='ñoñó@example.tld', + normalized='ñoñó@example.tld', ), ), ( @@ -168,7 +172,7 @@ def test_email_valid(email_input, output): smtputf8=True, ascii_domain='example.tld', domain='example.tld', - email='我買@example.tld', + normalized='我買@example.tld', ), ), ( @@ -178,7 +182,7 @@ def test_email_valid(email_input, output): smtputf8=True, ascii_domain='example.tld', 
domain='example.tld', - email='甲斐黒川日本@example.tld', + normalized='甲斐黒川日本@example.tld', ), ), ( @@ -188,7 +192,7 @@ def test_email_valid(email_input, output): smtputf8=True, ascii_domain='example.tld', domain='example.tld', - email='чебурашкаящик-с-апельсинами.рф@example.tld', + normalized='чебурашкаящик-с-апельсинами.рф@example.tld', ), ), ( @@ -198,7 +202,7 @@ def test_email_valid(email_input, output): smtputf8=True, ascii_domain='domain.with.idn.tld', domain='domain.with.idn.tld', - email='उदाहरण.परीक्ष@domain.with.idn.tld', + normalized='उदाहरण.परीक्ष@domain.with.idn.tld', ), ), ( @@ -208,7 +212,7 @@ def test_email_valid(email_input, output): smtputf8=True, ascii_domain='xn--qxaa9ba.gr', domain='εεττ.gr', - email='ιωάννης@εεττ.gr', + normalized='ιωάννης@εεττ.gr', ), ), ], From 83e8cedb3fedbe67474a1e69177e5a1208705524 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 15 Apr 2023 10:32:17 -0400 Subject: [PATCH 095/174] Normalize some mailbox names like postmaster to lowercase per RFC 2142 --- CHANGELOG.md | 1 + README.md | 3 ++- email_validator/rfc_constants.py | 7 +++++++ email_validator/validate_email.py | 11 ++++++++++- tests/test_syntax.py | 5 +++++ 5 files changed, 25 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a2a898b..23b5a53 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ There are no significant changes to which email addresses are considered valid/i * The quoted-string local part syntax (e.g. multiple @-signs, spaces, etc. if surrounded by quotes) and domain-literal addresses (e.g. @[192.XXX...] or @[IPv6:...]) are now parsed but not considered valid by default. Better error messages are now given for these addresses since it can be confusing for a technically valid address to be rejected, and new allow_quoted_local and allow_domain_literal options are added to allow these addresses if you really need them. 
* Some other error messages have changed to not repeat the email address in the error message. * The `email` field on the returned `ValidatedEmail` object has been renamed to `normalized` to be clearer about its importance, but access via `.email` is also still supported. +* Some mailbox names like `postmaster` are now normalized to lowercase per RFC 2142. * The library has been reorganized internally into smaller modules. * The tests have been reorganized and expanded. Deliverability tests now mostly use captured DNS responses so they can be run off-line. * The __main__ tool now reads options to validate_email from environment variables. diff --git a/README.md b/README.md index 0a7db95..60077ca 100644 --- a/README.md +++ b/README.md @@ -312,7 +312,8 @@ literal IPv6 addresses if you have allowed them by the `allow_quoted_local` and `allow_domain_literal` options. In quoted-string local parts, unnecessary backslash escaping is removed and even the surrounding quotes are removed if they are unnecessary. For IPv6 domain literals, the IPv6 address is -normalized to condensed form. +normalized to condensed form. [RFC 2142](https://datatracker.ietf.org/doc/html/rfc2142) +also requires lowercase normalization for some specific mailbox names like `postmaster@`. 
Examples -------- diff --git a/email_validator/rfc_constants.py b/email_validator/rfc_constants.py index 96816df..d5961d6 100644 --- a/email_validator/rfc_constants.py +++ b/email_validator/rfc_constants.py @@ -43,3 +43,10 @@ LOCAL_PART_MAX_LENGTH = 64 DNS_LABEL_LENGTH_LIMIT = 63 # in "octets", RFC 1035 2.3.1 DOMAIN_MAX_LENGTH = 255 # in "octets", RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2 + +# RFC 2142 +CASE_INSENSITIVE_MAILBOX_NAMES = [ + 'info', 'marking', 'sales', 'support', # section 3 + 'abuse', 'noc', 'security', # section 4 + 'postmaster', 'hostmaster', 'usenet', 'news', 'webmaster', 'www', 'uucp', 'ftp', # section 5 +] diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index 6114931..cbd60cc 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -2,7 +2,7 @@ from .exceptions_types import EmailSyntaxError, ValidatedEmail from .syntax import validate_email_local_part, validate_email_domain_name, validate_email_domain_literal, get_length_reason -from .rfc_constants import EMAIL_MAX_LENGTH, QUOTED_LOCAL_PART_ADDR +from .rfc_constants import EMAIL_MAX_LENGTH, QUOTED_LOCAL_PART_ADDR, CASE_INSENSITIVE_MAILBOX_NAMES def validate_email( @@ -92,6 +92,15 @@ def validate_email( ret.ascii_local_part = local_part_info["ascii_local_part"] ret.smtputf8 = local_part_info["smtputf8"] + # Some local parts are required to be case-insensitive, so we should normalize + # to lowercase. + # RFC 2142 + if ret.ascii_local_part is not None \ + and ret.ascii_local_part.lower() in CASE_INSENSITIVE_MAILBOX_NAMES \ + and ret.local_part is not None: + ret.ascii_local_part = ret.ascii_local_part.lower() + ret.local_part = ret.local_part.lower() + # Validate the email address's domain part syntax and get a normalized form. 
is_domain_literal = False if len(domain_part) == 0: diff --git a/tests/test_syntax.py b/tests/test_syntax.py index d58b9b4..57510e5 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -417,6 +417,11 @@ def test_email_test_domain_name_in_test_environment(): validate_email("anything@mycompany.test", test_environment=True) +def test_case_insensitive_mailbox_name(): + validate_email("POSTMASTER@test", test_environment=True).normalized = "postmaster@test" + validate_email("NOT-POSTMASTER@test", test_environment=True).normalized = "NOT-POSTMASTER@test" + + # This is the pyIsEmail (https://github.com/michaelherold/pyIsEmail) test suite. # # The test data was extracted by: From f18da74b3f6ca3f09315031949928e1d64d56455 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 15 Apr 2023 21:10:33 -0400 Subject: [PATCH 096/174] 2.0.0 --- CHANGELOG.md | 8 +++----- setup.cfg | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 23b5a53..4a28f88 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,7 @@ -2.0.0-dev4 ----------- +2.0.0 (April 15, 2023) +---------------------- -This is a pre-release for version 2.0.0. - -There are no significant changes to which email addresses are considered valid/invalid with default options, but there are many changes in error messages and internal improvements to the library including the addition of type annotations. New options are added to allow quoted-string local parts and domain-literal addresses, but they are off by default. And Python 3.7+ is now required. +This is a major update to the library, but since email address specs haven't changed there should be no significant changes to which email addresses are considered valid or invalid with default options. 
There are new options for accepting unusual email addresses that were previously always rejected, some changes to how DNS errors are handled, many changes in error message text, and major internal improvements including the addition of type annotations. Python 3.7+ is now required. Details follow: * Python 2.x and 3.x versions through 3.6, and dnspython 1.x, are no longer supported. Python 3.7+ with dnspython 2.x are now required. * The dnspython package is no longer required if DNS checks are not used, although it will install automatically. diff --git a/setup.cfg b/setup.cfg index 60ff626..04495f4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = email_validator -version = 2.0.0-dev4 +version = 2.0.0 description = A robust email address syntax and deliverability validation library. long_description = file: README.md long_description_content_type = text/markdown From e190635bf77cef0dededd3abd380dccfde363374 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= Date: Sun, 16 Apr 2023 15:08:01 +0200 Subject: [PATCH 097/174] Include missing test files in sdist (#105) Include the missing `mocked_dns_response.py` and `mocked-dns-answers.json` files in sdist archives, as they are necessary to run the test suite. 
--- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index 2f9bf23..f4af13f 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ include email_validator.py include LICENSE README.md +recursive-include tests *.json *.py From 8252742d59f3c34cd490abbfd405204be318aca3 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Sun, 16 Apr 2023 15:09:16 +0200 Subject: [PATCH 098/174] Fix entry point location for standalone email_validator script (#106) --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 04495f4..aa44b93 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,7 +34,7 @@ python_requires = >=3.7 [options.entry_points] console_scripts = - email_validator=email_validator:main + email_validator=email_validator.__main__:main [flake8] max-line-length = 120 From 2acd7d5d9c5ebba2389bbcfd58dcd742b856ccdf Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 16 Apr 2023 09:11:15 -0400 Subject: [PATCH 099/174] Remove old obsolete line from MANIFEST.in for email_validator.py --- MANIFEST.in | 1 - 1 file changed, 1 deletion(-) diff --git a/MANIFEST.in b/MANIFEST.in index f4af13f..b22c457 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,2 @@ -include email_validator.py include LICENSE README.md recursive-include tests *.json *.py From a83c3d662423a0da4036c0b08dd66b261584007d Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 16 Apr 2023 09:09:51 -0400 Subject: [PATCH 100/174] 2.0.0.post1 --- CHANGELOG.md | 2 ++ setup.cfg | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a28f88..8baf205 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,8 @@ This is a major update to the library, but since email address specs haven't cha * Type annotations have been added to the exported methods and the ValidatedEmail class and some internal methods. 
* The old dict-like pattern for the return value of validate_email is deprecated. +Version 2.0.0.post1 corrects some packaging issues. + Version 1.3.1 (January 21, 2023) -------------------------------- diff --git a/setup.cfg b/setup.cfg index aa44b93..e839e48 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = email_validator -version = 2.0.0 +version = 2.0.0.post1 description = A robust email address syntax and deliverability validation library. long_description = file: README.md long_description_content_type = text/markdown From beef710d5092113a8fdd9ec83d69928447e5bad0 Mon Sep 17 00:00:00 2001 From: Atte Lautanala Date: Thu, 20 Apr 2023 00:01:03 +0300 Subject: [PATCH 101/174] Add py.typed marker file (#107) The file is needed so that static type validators, like `mypy`, can find type annotations from this package. The marker file is described in [PEP 561]. [PEP 561]: https://peps.python.org/pep-0561/ --- email_validator/py.typed | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 email_validator/py.typed diff --git a/email_validator/py.typed b/email_validator/py.typed new file mode 100644 index 0000000..e69de29 From 3ea13089194f4803b82f22d4f838630752517b3a Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Wed, 22 Mar 2023 18:04:09 -0400 Subject: [PATCH 102/174] Add an exception if timeout and dns_resolver are both passed --- email_validator/deliverability.py | 2 ++ email_validator/validate_email.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/email_validator/deliverability.py b/email_validator/deliverability.py index f787bb3..4846091 100644 --- a/email_validator/deliverability.py +++ b/email_validator/deliverability.py @@ -31,6 +31,8 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option timeout = DEFAULT_TIMEOUT dns_resolver = dns.resolver.get_default_resolver() dns_resolver.lifetime = timeout + elif timeout is not None: + raise ValueError("It's not valid to pass both 
timeout and dns_resolver.") deliverability_info: Dict[str, Any] = {} diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index cbd60cc..0d8f581 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -40,7 +40,7 @@ def validate_email( test_environment = TEST_ENVIRONMENT if globally_deliverable is None: globally_deliverable = GLOBALLY_DELIVERABLE - if timeout is None: + if timeout is None and dns_resolver is None: timeout = DEFAULT_TIMEOUT # Allow email to be a str or bytes instance. If bytes, From 1b3e4c669499cc966c55bc87a11d2cc0a633f726 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Wed, 19 Apr 2023 17:04:26 -0400 Subject: [PATCH 103/174] 2.0.0.post2 --- CHANGELOG.md | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8baf205..48a31c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,7 @@ This is a major update to the library, but since email address specs haven't cha * Type annotations have been added to the exported methods and the ValidatedEmail class and some internal methods. * The old dict-like pattern for the return value of validate_email is deprecated. -Version 2.0.0.post1 corrects some packaging issues. +Versions 2.0.0.post1 and 2.0.0.post2 corrected some packaging issues. 2.0.0.post2 also added a check for an invalid combination of arguments. Version 1.3.1 (January 21, 2023) -------------------------------- diff --git a/setup.cfg b/setup.cfg index e839e48..d299498 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = email_validator -version = 2.0.0.post1 +version = 2.0.0.post2 description = A robust email address syntax and deliverability validation library. 
long_description = file: README.md long_description_content_type = text/markdown From 68b9d1892dc6007844a80acf5677cf6166bf5533 Mon Sep 17 00:00:00 2001 From: commonism Date: Mon, 5 Jun 2023 16:38:36 +0200 Subject: [PATCH 104/174] Exposing a version str & deprecating the email attribute (#110) * expose version as __version__ * assist ValidatedEmail.email deprecation --- email_validator/__init__.py | 4 ++-- email_validator/exceptions_types.py | 7 +++++++ email_validator/version.py | 1 + setup.cfg | 2 +- tests/test_main.py | 7 +++++++ 5 files changed, 18 insertions(+), 3 deletions(-) create mode 100644 email_validator/version.py diff --git a/email_validator/__init__.py b/email_validator/__init__.py index d5f26a2..c3b5929 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -4,12 +4,12 @@ from .exceptions_types import ValidatedEmail, EmailNotValidError, \ EmailSyntaxError, EmailUndeliverableError from .validate_email import validate_email - +from .version import __version__ __all__ = ["validate_email", "ValidatedEmail", "EmailNotValidError", "EmailSyntaxError", "EmailUndeliverableError", - "caching_resolver"] + "caching_resolver", "__version__"] def caching_resolver(*args, **kwargs): diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py index 9a1b331..ee9d50a 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/exceptions_types.py @@ -78,6 +78,13 @@ def __getattr__(self, key): return self.normalized raise AttributeError() + @property + def email(self): + import warnings + warnings.warn("ValidatedEmail.email is deprecated and will be removed, use ValidatedEmail.normalized instead", DeprecationWarning) + return self.normalized + + """For backwards compatibility, some fields are also exposed through a dict-like interface. 
Note that some of the names changed when they became attributes.""" def __getitem__(self, key): diff --git a/email_validator/version.py b/email_validator/version.py new file mode 100644 index 0000000..7857319 --- /dev/null +++ b/email_validator/version.py @@ -0,0 +1 @@ +__version__ = "2.0.0.post2" \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index d299498..dc97892 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = email_validator -version = 2.0.0.post2 +version = attr: email_validator.version.__version__ description = A robust email address syntax and deliverability validation library. long_description = file: README.md long_description_content_type = text/markdown diff --git a/tests/test_main.py b/tests/test_main.py index e32af94..dc01bc1 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -58,3 +58,10 @@ def test_bytes_input(): input_email = "testaddr中example.tld".encode("utf32") with pytest.raises(EmailSyntaxError): validate_email(input_email, check_deliverability=False) + + +def test_deprecation(): + input_email = b"testaddr@example.tld" + valid_email = validate_email(input_email, check_deliverability=False) + with pytest.raises(DeprecationWarning): + assert valid_email.email is not None \ No newline at end of file From 5abaa7b4ce6677e5a2217db2e52202a760de3c24 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 5 Jun 2023 10:50:30 -0400 Subject: [PATCH 105/174] Fix tests and add CHANGELOG entry for last commit --- CHANGELOG.md | 6 ++++++ README.md | 2 +- email_validator/exceptions_types.py | 1 - email_validator/version.py | 2 +- tests/test_main.py | 2 +- tests/test_syntax.py | 8 ++++++-- 6 files changed, 15 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 48a31c6..d7b02db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +In Development +-------------- + +* The old `email` field on the returned `ValidatedEmail` object, which in the previous version was superseded by
`normalized`, will now raise a deprecation warning if used. See https://stackoverflow.com/q/879173 for strategies to suppress the DeprecationWarning. +* A `__version__` module attribute is added. + 2.0.0 (April 15, 2023) ---------------------- diff --git a/README.md b/README.md index 60077ca..72b3b0f 100644 --- a/README.md +++ b/README.md @@ -436,7 +436,7 @@ The package is distributed as a universal wheel and as a source package. To release: * Update CHANGELOG.md. -* Update the version number in setup.cfg. +* Update the version number in `email_validator/version.py`. * Make & push a commit with the new version number and make sure tests pass. * Make & push a tag (see command below). * Make a release at https://github.com/JoshData/python-email-validator/releases/new. diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py index ee9d50a..4b8f200 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/exceptions_types.py @@ -84,7 +84,6 @@ def email(self): warnings.warn("ValidatedEmail.email is deprecated and will be removed, use ValidatedEmail.normalized instead", DeprecationWarning) return self.normalized - """For backwards compatibility, some fields are also exposed through a dict-like interface. 
Note that some of the names changed when they became attributes.""" def __getitem__(self, key): diff --git a/email_validator/version.py b/email_validator/version.py index 7857319..80476c8 100644 --- a/email_validator/version.py +++ b/email_validator/version.py @@ -1 +1 @@ -__version__ = "2.0.0.post2" \ No newline at end of file +__version__ = "2.0.0.post2" diff --git a/tests/test_main.py b/tests/test_main.py index dc01bc1..49a3a77 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -64,4 +64,4 @@ def test_deprecation(): input_email = b"testaddr@example.tld" valid_email = validate_email(input_email, check_deliverability=False) with pytest.raises(DeprecationWarning): - assert valid_email.email is not None \ No newline at end of file + assert valid_email.email is not None diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 57510e5..1c9659c 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -78,8 +78,12 @@ def test_email_valid(email_input, output): assert emailinfo == output assert validate_email(email_input, check_deliverability=False, allow_smtputf8=True) == output - # Check that the old way to access the normalized form still works. - assert emailinfo.email == emailinfo.normalized + # Check that the old `email` attribute to access the normalized form still works + # if the DeprecationWarning is suppressed. 
+ import warnings + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=DeprecationWarning) + assert emailinfo.email == emailinfo.normalized @pytest.mark.parametrize( From dbf4618d62d9403eb5bebd00f869abebcda0d0e2 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 26 Sep 2023 06:39:42 -0400 Subject: [PATCH 106/174] Drop Python 3.7 and update GitHub Actions to build with Python 3.12.0-rc.3 flake8's latest version for Python 3.7 failed to parse f-strings correctly when running on Python 3.12, giving: --ignore=E501,E126,W503 email_validator tests email_validator/syntax.py:30:24: E231 missing whitespace after ':' email_validator/syntax.py:32:24: E231 missing whitespace after ':' ``` By dropping 3.7 we can update flake8 to work on all other versions. See https://github.com/actions/python-versions/releases for the Python versions supported in GitHub Actions. --- .github/workflows/test_and_build.yaml | 2 +- setup.cfg | 2 +- test_requirements.txt | 30 ++++++++++++--------------- 3 files changed, 15 insertions(+), 19 deletions(-) diff --git a/.github/workflows/test_and_build.yaml b/.github/workflows/test_and_build.yaml index e80acd6..441d1ba 100644 --- a/.github/workflows/test_and_build.yaml +++ b/.github/workflows/test_and_build.yaml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12.0-alpha.5"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12.0-rc.3"] steps: - uses: actions/checkout@v3 diff --git a/setup.cfg b/setup.cfg index dc97892..2394d15 100644 --- a/setup.cfg +++ b/setup.cfg @@ -14,11 +14,11 @@ classifiers = Intended Audience :: Developers License :: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication Programming Language :: Python :: 3 - Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.11 + Programming Language :: 
Python :: 3.12 Topic :: Software Development :: Libraries :: Python Modules keywords = email address validator diff --git a/test_requirements.txt b/test_requirements.txt index 5f11247..b41edca 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -1,5 +1,5 @@ # This file was generated by running: -# sudo docker run --rm -it --network=host python:3.7-slim /bin/bash +# sudo docker run --rm -it --network=host python:3.8-slim /bin/bash # pip install dnspython idna # from setup.cfg # pip install pytest pytest-cov coverage flake8 mypy # pip freeze @@ -7,24 +7,20 @@ # the earliest Python version we support, and some exception # messages may depend on package versions, so we pin versions # for reproducible testing.) -attrs==22.2.0 -coverage==7.2.1 -dnspython==2.3.0 -exceptiongroup==1.1.0 -flake8==5.0.4 +coverage==7.3.1 +dnspython==2.4.2 +exceptiongroup==1.1.3 +flake8==6.1.0 idna==3.4 -importlib-metadata==4.2.0 iniconfig==2.0.0 mccabe==0.7.0 -mypy==1.0.1 +mypy==1.5.1 mypy-extensions==1.0.0 -packaging==23.0 -pluggy==1.0.0 -pycodestyle==2.9.1 -pyflakes==2.5.0 -pytest==7.2.1 -pytest-cov==4.0.0 +packaging==23.1 +pluggy==1.3.0 +pycodestyle==2.11.0 +pyflakes==3.1.0 +pytest==7.4.2 +pytest-cov==4.1.0 tomli==2.0.1 -typed-ast==1.5.4 -typing_extensions==4.5.0 -zipp==3.15.0 +typing_extensions==4.8.0 From c52aaa2d17dc28f54608df160457c926b1341f44 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Wed, 18 Oct 2023 09:52:15 -0400 Subject: [PATCH 107/174] Fix incorrect test for DeprecationWarning added in 68b9d1892dc6007844a80acf5677cf6166bf5533 --- tests/test_main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_main.py b/tests/test_main.py index 49a3a77..579163f 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -63,5 +63,5 @@ def test_bytes_input(): def test_deprecation(): input_email = b"testaddr@example.tld" valid_email = validate_email(input_email, check_deliverability=False) - with pytest.raises(DeprecationWarning): + with 
pytest.deprecated_call(): assert valid_email.email is not None From 3aad01998b69a1da25141fdd04f9e60e7d38085a Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Wed, 18 Oct 2023 08:06:28 -0400 Subject: [PATCH 108/174] Update GitHub Actions to build with released version of Python 3.12 --- .github/workflows/test_and_build.yaml | 2 +- CHANGELOG.md | 1 + README.md | 2 +- test_requirements.txt | 8 ++++---- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test_and_build.yaml b/.github/workflows/test_and_build.yaml index 441d1ba..5268a2b 100644 --- a/.github/workflows/test_and_build.yaml +++ b/.github/workflows/test_and_build.yaml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12.0-rc.3"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12.0"] steps: - uses: actions/checkout@v3 diff --git a/CHANGELOG.md b/CHANGELOG.md index d7b02db..4961fc5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ In Development -------------- +* Python 3.8+ is now required (support for Python 3.7 was dropped). * The old `email` field on the returned `ValidatedEmail` object, which in the previous version was superseded by `normalized`, will now raise a deprecation warning if used. See https://stackoverflow.com/q/879173 for strategies to suppress the DeprecationWarning. * A `__version__` module attribute is added. diff --git a/README.md b/README.md index 72b3b0f..1652fcf 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ email-validator: Validate Email Addresses ========================================= A robust email address syntax and deliverability validation library for -Python 3.7+ by [Joshua Tauberer](https://joshdata.me). +Python 3.8+ by [Joshua Tauberer](https://joshdata.me). This library validates that a string is of the form `name@example.com` and optionally checks that the domain name is set up to receive email. 
diff --git a/test_requirements.txt b/test_requirements.txt index b41edca..db9bbbd 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -7,18 +7,18 @@ # the earliest Python version we support, and some exception # messages may depend on package versions, so we pin versions # for reproducible testing.) -coverage==7.3.1 +coverage==7.3.2 dnspython==2.4.2 exceptiongroup==1.1.3 flake8==6.1.0 idna==3.4 iniconfig==2.0.0 mccabe==0.7.0 -mypy==1.5.1 +mypy==1.6.1 mypy-extensions==1.0.0 -packaging==23.1 +packaging==23.2 pluggy==1.3.0 -pycodestyle==2.11.0 +pycodestyle==2.11.1 pyflakes==3.1.0 pytest==7.4.2 pytest-cov==4.1.0 From 2a9653b8701db574e4ac69451908aef18eddab82 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 2 Jul 2023 09:47:25 -0400 Subject: [PATCH 109/174] Remove .travis.yml since we now do CI with GitHub actions --- .travis.yml | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 3a283dc..0000000 --- a/.travis.yml +++ /dev/null @@ -1,23 +0,0 @@ -os: linux -dist: bionic -language: python -cache: pip - -python: -- '3.7' -- '3.8' -- '3.9' -- '3.10' -- '3.11' -- '3.12-dev' - -install: -- make install - -script: -- make typing -- make lint -- make test - -after_success: -- bash <(curl -s https://codecov.io/bash) From 371c12079fa10cd3f93ba68aaf149070a7119d2b Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 2 Jul 2023 09:47:53 -0400 Subject: [PATCH 110/174] Mark the email address argument to validate_email as positional-only --- CHANGELOG.md | 1 + email_validator/validate_email.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4961fc5..744644f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ In Development * Python 3.8+ is now required (support for Python 3.7 was dropped). 
* The old `email` field on the returned `ValidatedEmail` object, which in the previous version was superseded by `normalized`, will now raise a deprecation warning if used. See https://stackoverflow.com/q/879173 for strategies to suppress the DeprecationWarning. * A `__version__` module attribute is added. +* The email address argument to validate_email is now marked as positional-only to better reflect the documented usage using the new Python 3.8 feature. 2.0.0 (April 15, 2023) ---------------------- diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index 0d8f581..b33394a 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -7,8 +7,8 @@ def validate_email( email: Union[str, bytes], - # /, # not supported in Python 3.6, 3.7 - *, + /, # prior arguments are positional-only + *, # subsequent arguments are keyword-only allow_smtputf8: Optional[bool] = None, allow_empty_local: bool = False, allow_quoted_local: Optional[bool] = None, From 814b4884a6d6a804bd344101b0c9999d12f6d828 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 2 Jul 2023 10:26:25 -0400 Subject: [PATCH 111/174] Use the new Python 3.8 walrus operator and simplify some if statements --- email_validator/syntax.py | 9 +++------ email_validator/validate_email.py | 3 +-- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index abb4ea9..7287476 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -69,8 +69,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp # so if it was originally quoted (quoted_local_part is True) and this regex matches, # it's ok. # (RFC 5321 4.1.2 / RFC 5322 3.2.4). - m = DOT_ATOM_TEXT.match(local) - if m: + if DOT_ATOM_TEXT.match(local): # It's valid. And since it's just the permitted ASCII characters, # it's normalized and safe. 
If the local part was originally quoted, # the quoting was unnecessary and it'll be returned as normalized to @@ -89,8 +88,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp # RFC 6531 section 3.3. valid: Optional[str] = None requires_smtputf8 = False - m = DOT_ATOM_TEXT_INTL.match(local) - if m: + if DOT_ATOM_TEXT_INTL.match(local): # But international characters in the local part may not be permitted. if not allow_smtputf8: # Check for invalid characters against the non-internationalized @@ -347,8 +345,7 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera # Check the syntax of the string returned by idna.encode. # It should never fail. - m = DOT_ATOM_TEXT_HOSTNAME.match(ascii_domain) - if not m: + if not DOT_ATOM_TEXT_HOSTNAME.match(ascii_domain): raise EmailSyntaxError("The email address contains invalid characters after the @-sign after IDNA encoding.") # Check the length of the domain name in bytes. diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index b33394a..0d2e7a8 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -58,8 +58,7 @@ def validate_email( # part if the local part is quoted. If the address is quoted, # split it at a non-escaped @-sign and unescape the escaping. 
quoted_local_part = False - m = QUOTED_LOCAL_PART_ADDR.match(email) - if m: + if m := QUOTED_LOCAL_PART_ADDR.match(email): quoted_local_part = True local_part, domain_part = m.groups() From 786defc7c20e22341b338f56c8334eb4c7fce711 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Thu, 19 Oct 2023 07:16:26 -0400 Subject: [PATCH 112/174] Improve some code comments, refactor some code, mention length checks in the README --- README.md | 8 +++ email_validator/syntax.py | 83 ++++++++++++++++++++++++--- email_validator/validate_email.py | 93 +++++++++---------------------- tests/test_syntax.py | 4 +- 4 files changed, 111 insertions(+), 77 deletions(-) diff --git a/README.md b/README.md index 1652fcf..251e7a8 100644 --- a/README.md +++ b/README.md @@ -315,6 +315,14 @@ they are unnecessary. For IPv6 domain literals, the IPv6 address is normalized to condensed form. [RFC 2142](https://datatracker.ietf.org/doc/html/rfc2142) also requires lowercase normalization for some specific mailbox names like `postmaster@`. +### Length checks + +This library checks that the length of the email address is not longer than +the maximum length. The check is performed on the normalized form of the +address, which might be different from a string provided by a user. If you +send email to the original string and not the normalized address, the email +might be rejected because the original address could be too long. 
+ Examples -------- diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 7287476..fef785b 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -1,7 +1,8 @@ from .exceptions_types import EmailSyntaxError from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \ DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \ - DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS + DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS, \ + QUOTED_LOCAL_PART_ADDR import re import unicodedata @@ -10,6 +11,35 @@ from typing import Optional +def split_email(email): + # Return the local part and domain part of the address and + # whether the local part was quoted as a three-tuple. + + # Typical email addresses have a single @-sign, but the + # awkward "quoted string" local part form (RFC 5321 4.1.2) + # allows @-signs (and escaped quotes) to appear in the local + # part if the local part is quoted. If the address is quoted, + # split it at a non-escaped @-sign and unescape the escaping. + if m := QUOTED_LOCAL_PART_ADDR.match(email): + local_part, domain_part = m.groups() + + # Since backslash-escaping is no longer needed because + # the quotes are removed, remove backslash-escaping + # to return in the normalized form. + import re + local_part = re.sub(r"\\(.)", "\\1", local_part) + + return local_part, domain_part, True + + else: + # Split at the one and only at-sign. + parts = email.split('@') + if len(parts) != 2: + raise EmailSyntaxError("The email address is not valid. 
It must have exactly one @-sign.") + local_part, domain_part = parts + return local_part, domain_part, False + + def get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH): """Helper function to return an error message related to invalid length.""" diff = len(addr) - limit @@ -367,7 +397,7 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera raise EmailSyntaxError(f"After the @-sign, periods cannot be separated by so many characters {reason}.") if globally_deliverable: - # All publicly deliverable addresses have domain named with at least + # All publicly deliverable addresses have domain names with at least # one period, at least for gTLDs created since 2013 (per the ICANN Board # New gTLD Program Committee, https://www.icann.org/en/announcements/details/new-gtld-dotless-domain-names-prohibited-30-8-2013-en). # We'll consider the lack of a period a syntax error @@ -428,7 +458,48 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera } -def validate_email_domain_literal(domain_literal, allow_domain_literal=False): +def validate_email_length(addrinfo): + # If the email address has an ASCII representation, then we assume it may be + # transmitted in ASCII (we can't assume SMTPUTF8 will be used on all hops to + # the destination) and the length limit applies to ASCII characters (which is + # the same as octets). The number of characters in the internationalized form + # may be many fewer (because IDNA ASCII is verbose) and could be less than 254 + # Unicode characters, and of course the number of octets over the limit may + # not be the number of characters over the limit, so if the email address is + # internationalized, we can't give any simple information about why the address + # is too long. 
+ if addrinfo.ascii_email and len(addrinfo.ascii_email) > EMAIL_MAX_LENGTH: + if addrinfo.ascii_email == addrinfo.normalized: + reason = get_length_reason(addrinfo.ascii_email) + elif len(addrinfo.normalized) > EMAIL_MAX_LENGTH: + # If there are more than 254 characters, then the ASCII + # form is definitely going to be too long. + reason = get_length_reason(addrinfo.normalized, utf8=True) + else: + reason = "(when converted to IDNA ASCII)" + raise EmailSyntaxError(f"The email address is too long {reason}.") + + # In addition, check that the UTF-8 encoding (i.e. not IDNA ASCII and not + # Unicode characters) is at most 254 octets. If the addres is transmitted using + # SMTPUTF8, then the length limit probably applies to the UTF-8 encoded octets. + # If the email address has an ASCII form that differs from its internationalized + # form, I don't think the internationalized form can be longer, and so the ASCII + # form length check would be sufficient. If there is no ASCII form, then we have + # to check the UTF-8 encoding. The UTF-8 encoding could be up to about four times + # longer than the number of characters. + # + # See the length checks on the local part and the domain. + if len(addrinfo.normalized.encode("utf8")) > EMAIL_MAX_LENGTH: + if len(addrinfo.normalized) > EMAIL_MAX_LENGTH: + # If there are more than 254 characters, then the UTF-8 + # encoding is definitely going to be too long. + reason = get_length_reason(addrinfo.normalized, utf8=True) + else: + reason = "(when encoded in bytes)" + raise EmailSyntaxError(f"The email address is too long {reason}.") + + +def validate_email_domain_literal(domain_literal): # This is obscure domain-literal syntax. Parse it and return # a compressed/normalized address. # RFC 5321 4.1.3 and RFC 5322 3.4.1. 
@@ -441,8 +512,6 @@ def validate_email_domain_literal(domain_literal, allow_domain_literal=False): addr = ipaddress.IPv4Address(domain_literal) except ValueError as e: raise EmailSyntaxError(f"The address in brackets after the @-sign is not valid: It is not an IPv4 address ({e}) or is missing an address literal tag.") - if not allow_domain_literal: - raise EmailSyntaxError("A bracketed IPv4 address after the @-sign is not allowed here.") # Return the IPv4Address object and the domain back unchanged. return { @@ -456,8 +525,6 @@ def validate_email_domain_literal(domain_literal, allow_domain_literal=False): addr = ipaddress.IPv6Address(domain_literal[5:]) except ValueError as e: raise EmailSyntaxError(f"The IPv6 address in brackets after the @-sign is not valid ({e}).") - if not allow_domain_literal: - raise EmailSyntaxError("A bracketed IPv6 address after the @-sign is not allowed here.") # Return the IPv6Address object and construct a normalized # domain literal. @@ -466,6 +533,8 @@ def validate_email_domain_literal(domain_literal, allow_domain_literal=False): "domain": f"[IPv6:{addr.compressed}]", } + # Nothing else is valid. 
+ if ":" not in domain_literal: raise EmailSyntaxError("The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.") diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index 0d2e7a8..d2791fe 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -1,8 +1,8 @@ from typing import Optional, Union from .exceptions_types import EmailSyntaxError, ValidatedEmail -from .syntax import validate_email_local_part, validate_email_domain_name, validate_email_domain_literal, get_length_reason -from .rfc_constants import EMAIL_MAX_LENGTH, QUOTED_LOCAL_PART_ADDR, CASE_INSENSITIVE_MAILBOX_NAMES +from .syntax import split_email, validate_email_local_part, validate_email_domain_name, validate_email_domain_literal, validate_email_length +from .rfc_constants import CASE_INSENSITIVE_MAILBOX_NAMES def validate_email( @@ -20,9 +20,9 @@ def validate_email( dns_resolver: Optional[object] = None ) -> ValidatedEmail: """ - Validates an email address, raising an EmailNotValidError if the address is not valid or returning a dict of - information when the address is valid. The email argument can be a str or a bytes instance, - but if bytes it must be ASCII-only. This is the main method of this library. + Given an email address, and some options, returns a ValidatedEmail instance + with information about the address if it is valid or, if the address is not + valid, raises an EmailNotValidError. This is the main function of the module. """ # Fill in default values of arguments. @@ -52,26 +52,13 @@ def validate_email( except ValueError: raise EmailSyntaxError("The email address is not valid ASCII.") - # Typical email addresses have a single @-sign, but the - # awkward "quoted string" local part form (RFC 5321 4.1.2) - # allows @-signs (and escaped quotes) to appear in the local - # part if the local part is quoted. 
If the address is quoted, - # split it at a non-escaped @-sign and unescape the escaping. - quoted_local_part = False - if m := QUOTED_LOCAL_PART_ADDR.match(email): - quoted_local_part = True - local_part, domain_part = m.groups() - - # Remove backslashes. - import re - local_part = re.sub(r"\\(.)", "\\1", local_part) - - else: - # Split at the one and only at-sign. - parts = email.split('@') - if len(parts) != 2: - raise EmailSyntaxError("The email address is not valid. It must have exactly one @-sign.") - local_part, domain_part = parts + # Split the address into the local part (before the @-sign) + # and the domain part (after the @-sign). Normally, there + # is only one @-sign. But the awkward "quoted string" local + # part form (RFC 5321 4.1.2) allows @-signs in the local + # part if the local part is quoted. + local_part, domain_part, is_quoted_local_part \ + = split_email(email) # Collect return values in this instance. ret = ValidatedEmail() @@ -84,13 +71,17 @@ def validate_email( local_part_info = validate_email_local_part(local_part, allow_smtputf8=allow_smtputf8, allow_empty_local=allow_empty_local, - quoted_local_part=quoted_local_part) - if quoted_local_part and not allow_quoted_local: - raise EmailSyntaxError("Quoting the part before the @-sign is not allowed here.") + quoted_local_part=is_quoted_local_part) ret.local_part = local_part_info["local_part"] ret.ascii_local_part = local_part_info["ascii_local_part"] ret.smtputf8 = local_part_info["smtputf8"] + # If a quoted local part isn't allowed but is present, now raise an exception. + # This is done after any exceptions raised by validate_email_local_part so + # that mandatory checks have highest precedence. + if is_quoted_local_part and not allow_quoted_local: + raise EmailSyntaxError("Quoting the part before the @-sign is not allowed here.") + # Some local parts are required to be case-insensitive, so we should normalize # to lowercase. 
# RFC 2142 @@ -107,7 +98,9 @@ def validate_email( elif domain_part.startswith("[") and domain_part.endswith("]"): # Parse the address in the domain literal and get back a normalized domain. - domain_part_info = validate_email_domain_literal(domain_part[1:-1], allow_domain_literal=allow_domain_literal) + domain_part_info = validate_email_domain_literal(domain_part[1:-1]) + if not allow_domain_literal: + raise EmailSyntaxError("A bracketed IP address after the @-sign is not allowed here.") ret.domain = domain_part_info["domain"] ret.ascii_domain = domain_part_info["domain"] # Domain literals are always ASCII. ret.domain_address = domain_part_info["domain_address"] @@ -131,48 +124,12 @@ def validate_email( else: ret.ascii_email = None - # If the email address has an ASCII representation, then we assume it may be - # transmitted in ASCII (we can't assume SMTPUTF8 will be used on all hops to - # the destination) and the length limit applies to ASCII characters (which is - # the same as octets). The number of characters in the internationalized form - # may be many fewer (because IDNA ASCII is verbose) and could be less than 254 - # Unicode characters, and of course the number of octets over the limit may - # not be the number of characters over the limit, so if the email address is - # internationalized, we can't give any simple information about why the address - # is too long. - # - # In addition, check that the UTF-8 encoding (i.e. not IDNA ASCII and not - # Unicode characters) is at most 254 octets. If the addres is transmitted using - # SMTPUTF8, then the length limit probably applies to the UTF-8 encoded octets. - # If the email address has an ASCII form that differs from its internationalized - # form, I don't think the internationalized form can be longer, and so the ASCII - # form length check would be sufficient. If there is no ASCII form, then we have - # to check the UTF-8 encoding. 
The UTF-8 encoding could be up to about four times - # longer than the number of characters. - # - # See the length checks on the local part and the domain. - if ret.ascii_email and len(ret.ascii_email) > EMAIL_MAX_LENGTH: - if ret.ascii_email == ret.normalized: - reason = get_length_reason(ret.ascii_email) - elif len(ret.normalized) > EMAIL_MAX_LENGTH: - # If there are more than 254 characters, then the ASCII - # form is definitely going to be too long. - reason = get_length_reason(ret.normalized, utf8=True) - else: - reason = "(when converted to IDNA ASCII)" - raise EmailSyntaxError(f"The email address is too long {reason}.") - if len(ret.normalized.encode("utf8")) > EMAIL_MAX_LENGTH: - if len(ret.normalized) > EMAIL_MAX_LENGTH: - # If there are more than 254 characters, then the UTF-8 - # encoding is definitely going to be too long. - reason = get_length_reason(ret.normalized, utf8=True) - else: - reason = "(when encoded in bytes)" - raise EmailSyntaxError(f"The email address is too long {reason}.") + # Check the length of the address. + validate_email_length(ret) if check_deliverability and not test_environment: # Validate the email address's deliverability using DNS - # and update the return dict with metadata. + # and update the returned ValidatedEmail object with metadata. if is_domain_literal: # There is nothing to check --- skip deliverability checks. 
diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 1c9659c..8709845 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -330,9 +330,9 @@ def test_domain_literal(): ('me@xn--0.tld', 'The part after the @-sign is not valid IDNA (Invalid A-label).'), ('me@yy--0.tld', 'An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.'), ('me@yy--0.tld', 'An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.'), - ('me@[127.0.0.1]', 'A bracketed IPv4 address after the @-sign is not allowed here.'), + ('me@[127.0.0.1]', 'A bracketed IP address after the @-sign is not allowed here.'), ('me@[127.0.0.999]', 'The address in brackets after the @-sign is not valid: It is not an IPv4 address (Octet 999 (> 255) not permitted in \'127.0.0.999\') or is missing an address literal tag.'), - ('me@[IPv6:::1]', 'A bracketed IPv6 address after the @-sign is not allowed here.'), + ('me@[IPv6:::1]', 'A bracketed IP address after the @-sign is not allowed here.'), ('me@[IPv6:::G]', 'The IPv6 address in brackets after the @-sign is not valid (Only hex digits permitted in \'G\' in \'::G\').'), ('me@[tag:text]', 'The part after the @-sign contains an invalid address literal tag in brackets.'), ('me@[untaggedtext]', 'The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.'), From 2c3501e367040978c4ecc00ddb7290ec50376aea Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 21 Oct 2023 06:47:29 -0400 Subject: [PATCH 113/174] Fixes to debug helper ValidatedEmail.as_constructor --- email_validator/exceptions_types.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py index 4b8f200..3f6409c 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/exceptions_types.py @@ -76,7 +76,7 @@ def 
__getattr__(self, key): return self.original if key == "email": return self.normalized - raise AttributeError() + raise AttributeError(key) @property def email(self): @@ -129,9 +129,10 @@ def as_constructor(self): + ",".join("\n {}={}".format( key, repr(getattr(self, key))) - for key in ('email', 'local_part', 'domain', + for key in ('normalized', 'local_part', 'domain', 'ascii_email', 'ascii_local_part', 'ascii_domain', 'smtputf8', 'mx', 'mx_fallback_type') + if hasattr(self, key) ) \ + ")" From c1f37d6fe8b4604579ee3b3f22255263e0822e50 Mon Sep 17 00:00:00 2001 From: PriteshJadhav132 <126667465+PriteshJadhav132@users.noreply.github.com> Date: Sat, 21 Oct 2023 22:09:20 +0530 Subject: [PATCH 114/174] Remove typo in README (#117) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 251e7a8..5912e0e 100644 --- a/README.md +++ b/README.md @@ -163,7 +163,7 @@ while True: ### Test addresses -This library rejects email addresess that use the [Special Use Domain Names](https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml) `invalid`, `localhost`, `test`, and some others by raising `EmailSyntaxError`. This is to protect your system from abuse: You probably don't want a user to be able to cause an email to be sent to `localhost` (although they might be able to still do so via a malicious MX record). However, in your non-production test environments you may want to use `@test` or `@myname.test` email addresses. There are three ways you can allow this: +This library rejects email addresses that use the [Special Use Domain Names](https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml) `invalid`, `localhost`, `test`, and some others by raising `EmailSyntaxError`. 
This is to protect your system from abuse: You probably don't want a user to be able to cause an email to be sent to `localhost` (although they might be able to still do so via a malicious MX record). However, in your non-production test environments you may want to use `@test` or `@myname.test` email addresses. There are three ways you can allow this: 1. Add `test_environment=True` to the call to `validate_email` (see above). 2. Set `email_validator.TEST_ENVIRONMENT` to `True` globally. From 36b06110f6b177410fc0f53126b5998620542693 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 22 Oct 2023 07:25:16 -0400 Subject: [PATCH 115/174] 2.1.0 --- CHANGELOG.md | 4 ++-- README.md | 2 +- email_validator/version.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 744644f..02d2277 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,5 @@ -In Development --------------- +2.1.0 (October 22, 2023) +------------------------ * Python 3.8+ is now required (support for Python 3.7 was dropped). * The old `email` field on the returned `ValidatedEmail` object, which in the previous version was superseded by `normalized`, will now raise a deprecation warning if used. See https://stackoverflow.com/q/879173 for strategies to suppress the DeprecationWarning. diff --git a/README.md b/README.md index 5912e0e..2c27c35 100644 --- a/README.md +++ b/README.md @@ -451,7 +451,7 @@ To release: * Publish a source and wheel distribution to pypi (see command below). 
```sh -git tag v$(grep version setup.cfg | sed "s/.*= //") +git tag v$(cat email_validator/version.py | sed "s/.* = //" | sed 's/"//g') git push --tags ./release_to_pypi.sh ``` diff --git a/email_validator/version.py b/email_validator/version.py index 80476c8..9aa3f90 100644 --- a/email_validator/version.py +++ b/email_validator/version.py @@ -1 +1 @@ -__version__ = "2.0.0.post2" +__version__ = "2.1.0" From c3e81090641f3a1e96975ed150e0a416f6a96cb5 Mon Sep 17 00:00:00 2001 From: tianwei Date: Mon, 23 Oct 2023 18:02:47 +0800 Subject: [PATCH 116/174] Drop Python 3.7 from python_requires (#118) --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 2394d15..a69971d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -27,7 +27,7 @@ packages = find: install_requires = dnspython>=2.0.0 # optional if deliverability check isn't needed idna>=2.0.0 -python_requires = >=3.7 +python_requires = >=3.8 [options.package_data] * = py.typed From fd655c0967feb64f537db9342cf55c44b87fb80e Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 23 Oct 2023 06:04:32 -0400 Subject: [PATCH 117/174] 2.1.0.post1 --- email_validator/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/email_validator/version.py b/email_validator/version.py index 9aa3f90..acc96f7 100644 --- a/email_validator/version.py +++ b/email_validator/version.py @@ -1 +1 @@ -__version__ = "2.1.0" +__version__ = "2.1.0.post1" From 86c761c361bea317915e5835eaadf0edbda8c67a Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Thu, 26 Oct 2023 08:42:19 -0400 Subject: [PATCH 118/174] Remove outdated Python syntax using pyupgrade --- email_validator/__init__.py | 2 -- email_validator/exceptions_types.py | 2 +- email_validator/rfc_constants.py | 2 +- email_validator/syntax.py | 28 ++++++++++++++-------------- 4 files changed, 16 insertions(+), 18 deletions(-) diff --git a/email_validator/__init__.py b/email_validator/__init__.py index c3b5929..cd1b301 100644 --- 
a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - # Export the main method, helper methods, and the public data types. from .exceptions_types import ValidatedEmail, EmailNotValidError, \ EmailSyntaxError, EmailUndeliverableError diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py index 3f6409c..81fed39 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/exceptions_types.py @@ -17,7 +17,7 @@ class EmailUndeliverableError(EmailNotValidError): pass -class ValidatedEmail(object): +class ValidatedEmail: """The validate_email function returns objects of this type holding the normalized form of the email address and other information.""" diff --git a/email_validator/rfc_constants.py b/email_validator/rfc_constants.py index d5961d6..3ab54e7 100644 --- a/email_validator/rfc_constants.py +++ b/email_validator/rfc_constants.py @@ -12,7 +12,7 @@ # addresses to also include three specific ranges of UTF8 defined in # RFC 3629 section 4, which appear to be the Unicode code points from # U+0080 to U+10FFFF. -ATEXT_INTL = ATEXT + u"\u0080-\U0010FFFF" +ATEXT_INTL = ATEXT + "\u0080-\U0010FFFF" ATEXT_INTL_RE = re.compile('[.' + ATEXT_INTL + ']') # ATEXT_INTL plus dots DOT_ATOM_TEXT_INTL = re.compile('[' + ATEXT_INTL + ']+(?:\\.[' + ATEXT_INTL + r']+)*\Z') diff --git a/email_validator/syntax.py b/email_validator/syntax.py index fef785b..4d7bfce 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -124,11 +124,11 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp # Check for invalid characters against the non-internationalized # permitted character set. 
# (RFC 5322 3.2.3) - bad_chars = set( + bad_chars = { safe_character_display(c) for c in local if not ATEXT_RE.match(c) - ) + } if bad_chars: raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(sorted(bad_chars)) + ".") @@ -148,20 +148,20 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp # (RFC 5321 4.1.2. RFC 5322 lists additional permitted *obsolete* # characters which are *not* allowed here. RFC 6531 section 3.3 # extends the range to UTF8 strings.) - bad_chars = set( + bad_chars = { safe_character_display(c) for c in local if not QTEXT_INTL.match(c) - ) + } if bad_chars: raise EmailSyntaxError("The email address contains invalid characters in quotes before the @-sign: " + ", ".join(sorted(bad_chars)) + ".") # See if any characters are outside of the ASCII range. - bad_chars = set( + bad_chars = { safe_character_display(c) for c in local if not (32 <= ord(c) <= 126) - ) + } if bad_chars: requires_smtputf8 = True @@ -213,11 +213,11 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp # Check for invalid characters. # (RFC 5322 3.2.3, plus RFC 6531 3.3) - bad_chars = set( + bad_chars = { safe_character_display(c) for c in local if not ATEXT_INTL_RE.match(c) - ) + } if bad_chars: raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".") @@ -306,11 +306,11 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera # Check for invalid characters before normalization. 
# (RFC 952 plus RFC 6531 section 3.3 for internationalized addresses) - bad_chars = set( + bad_chars = { safe_character_display(c) for c in domain if not ATEXT_HOSTNAME_INTL.match(c) - ) + } if bad_chars: raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".") @@ -437,11 +437,11 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera # Check for invalid characters after normalization. These # should never arise. See the similar checks above. - bad_chars = set( + bad_chars = { safe_character_display(c) for c in domain if not ATEXT_HOSTNAME_INTL.match(c) - ) + } if bad_chars: raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".") check_unsafe_chars(domain) @@ -544,11 +544,11 @@ def validate_email_domain_literal(domain_literal): # Check for permitted ASCII characters. This actually doesn't matter # since there will be an exception after anyway. - bad_chars = set( + bad_chars = { safe_character_display(c) for c in domain_literal if not DOMAIN_LITERAL_CHARS.match(c) - ) + } if bad_chars: raise EmailSyntaxError("The part after the @-sign contains invalid characters in brackets: " + ", ".join(sorted(bad_chars)) + ".") From b0f6661140bb9ca907f854be0f623e6d6be3c80e Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 13 Nov 2023 19:28:23 -0500 Subject: [PATCH 119/174] Fix typo 'marking' instead of 'marketing' in case-insensitive mailbox name list Fixed #121. 
--- email_validator/rfc_constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/email_validator/rfc_constants.py b/email_validator/rfc_constants.py index 3ab54e7..a6b9c59 100644 --- a/email_validator/rfc_constants.py +++ b/email_validator/rfc_constants.py @@ -46,7 +46,7 @@ # RFC 2142 CASE_INSENSITIVE_MAILBOX_NAMES = [ - 'info', 'marking', 'sales', 'support', # section 3 + 'info', 'marketing', 'sales', 'support', # section 3 'abuse', 'noc', 'security', # section 4 'postmaster', 'hostmaster', 'usenet', 'news', 'webmaster', 'www', 'uucp', 'ftp', # section 5 ] From f3eaf100b10965f20e96e6fbfb8a8d37e56e121c Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Wed, 22 Nov 2023 16:01:55 -0500 Subject: [PATCH 120/174] Use an f-string in a utility method, recommended by fixit --- email_validator/exceptions_types.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py index 81fed39..88bbf05 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/exceptions_types.py @@ -126,9 +126,7 @@ def __eq__(self, other): """This helps producing the README.""" def as_constructor(self): return "ValidatedEmail(" \ - + ",".join("\n {}={}".format( - key, - repr(getattr(self, key))) + + ",".join(f"\n {key}={repr(getattr(self, key))}" for key in ('normalized', 'local_part', 'domain', 'ascii_email', 'ascii_local_part', 'ascii_domain', 'smtputf8', 'mx', 'mx_fallback_type') From 5d881897713a03877b8862c1277f9685f6b6273b Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Wed, 22 Nov 2023 16:06:26 -0500 Subject: [PATCH 121/174] Remove a redundant elif condition handled in the previous elif block --- email_validator/syntax.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 4d7bfce..e6a5038 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -264,7 +264,7 @@ def 
check_unsafe_chars(s, allow_space=False): # as far as I can tell, but they violate the spirit of the non-internationalized # specification that email addresses do not contain line breaks when not quoted. bad_chars.add(c) - elif category[0] in ("C", "Z"): + elif category[0] == "C": # Control, format, surrogate, private use, and unassigned code points (C) # are all unsafe in various ways. Control and format characters can affect # text rendering if the email address is concatenated with other text. From b32933dd457d4d055dd8b857b5cbb6ebabbaa544 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Wed, 22 Nov 2023 16:14:00 -0500 Subject: [PATCH 122/174] Improve some code comments --- email_validator/syntax.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index e6a5038..3b5f204 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -322,15 +322,17 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera # Perform UTS-46 normalization, which includes casefolding, NFC normalization, # and converting all label separators (the period/full stop, fullwidth full stop, - # ideographic full stop, and halfwidth ideographic full stop) to basic periods. + # ideographic full stop, and halfwidth ideographic full stop) to regular dots. # It will also raise an exception if there is an invalid character in the input, - # such as "⒈" which is invalid because it would expand to include a period. + # such as "⒈" which is invalid because it would expand to include a dot. + # Since several characters are normalized to a dot, this has to come before + # checks related to dots, like check_dot_atom which comes next. try: domain = idna.uts46_remap(domain, std3_rules=False, transitional=False) except idna.IDNAError as e: raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") - # The domain part is made up period-separated "labels." 
Each label must + # The domain part is made up dot-separated "labels." Each label must # have at least one character and cannot start or end with dashes, which # means there are some surprising restrictions on periods and dashes. # Check that before we do IDNA encoding because the IDNA library gives @@ -362,6 +364,8 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera # For ASCII-only domains, the transformation does nothing and is safe to # apply. However, to ensure we don't rely on the idna library for basic # syntax checks, we don't use it if it's not needed. + # + # uts46 is off here because it is handled above. try: ascii_domain = idna.encode(domain, uts46=False).decode("ascii") except idna.IDNAError as e: @@ -371,6 +375,9 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera # one the user supplied. Also I'm not sure if the length check applies # to the internationalized form, the IDNA ASCII form, or even both! raise EmailSyntaxError("The email address is too long after the @-sign.") + + # Other errors seem to not be possible because the call to idna.uts46_remap + # would have already raised them. raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") # Check the syntax of the string returned by idna.encode. From 6e825bfc706051b0a3614c80981c7116afb798bd Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Wed, 22 Nov 2023 16:17:37 -0500 Subject: [PATCH 123/174] Update mocked-dns-answers.json and sort answers for stable diffs going forward This is done by setting mocked_dns_response.BUILD_MOCKED_DNS_RESPONSE_DATA = True. Some queries are removed. Probably because of earlier changes that some exceptions end DNS checks earlier. 
--- tests/mocked-dns-answers.json | 34 +++++----------------------------- tests/mocked_dns_response.py | 4 ++-- 2 files changed, 7 insertions(+), 31 deletions(-) diff --git a/tests/mocked-dns-answers.json b/tests/mocked-dns-answers.json index d5f6761..7e43747 100644 --- a/tests/mocked-dns-answers.json +++ b/tests/mocked-dns-answers.json @@ -7,20 +7,12 @@ }, "answer": [ "10 alt1.gmail-smtp-in.l.google.com.", - "30 alt3.gmail-smtp-in.l.google.com.", - "5 gmail-smtp-in.l.google.com.", "20 alt2.gmail-smtp-in.l.google.com.", - "40 alt4.gmail-smtp-in.l.google.com." + "30 alt3.gmail-smtp-in.l.google.com.", + "40 alt4.gmail-smtp-in.l.google.com.", + "5 gmail-smtp-in.l.google.com." ] }, - { - "query": { - "name": "xkxufoekjvjfjeodlfmdfjcu.com", - "type": "MX", - "class": "IN" - }, - "answer": [] - }, { "query": { "name": "xkxufoekjvjfjeodlfmdfjcu.com", @@ -64,18 +56,10 @@ "class": "IN" }, "answer": [ - "\"v=spf1 -all\"", - "\"MS=ms47108184\"" + "\"MS=ms47108184\"", + "\"v=spf1 -all\"" ] }, - { - "query": { - "name": "mail.example", - "type": "MX", - "class": "IN" - }, - "answer": [] - }, { "query": { "name": "mail.example", @@ -84,14 +68,6 @@ }, "answer": [] }, - { - "query": { - "name": "mail.example.com", - "type": "MX", - "class": "IN" - }, - "answer": [] - }, { "query": { "name": "mail.example.com", diff --git a/tests/mocked_dns_response.py b/tests/mocked_dns_response.py index cd32796..136de10 100644 --- a/tests/mocked_dns_response.py +++ b/tests/mocked_dns_response.py @@ -72,10 +72,10 @@ def save(self): "type": dns.rdatatype.to_text(key[1]), "class": dns.rdataclass.to_text(key[2]), }, - "answer": [ + "answer": sorted([ rr.to_text() for rr in value - ] + ]) } for key, value in self.data.items() ] From 8aa9223bdc78688c8e08d332fd0baad38c5e6d68 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Wed, 22 Nov 2023 16:35:00 -0500 Subject: [PATCH 124/174] Improve code coverage with a domain that returns NOANSWER instead of NXDOMAIN --- tests/mocked-dns-answers.json | 24 
++++++++++++++++++++++++ tests/test_deliverability.py | 8 +++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/tests/mocked-dns-answers.json b/tests/mocked-dns-answers.json index 7e43747..19e443c 100644 --- a/tests/mocked-dns-answers.json +++ b/tests/mocked-dns-answers.json @@ -60,6 +60,30 @@ "\"v=spf1 -all\"" ] }, + { + "query": { + "name": "justtxt.joshdata.me", + "type": "MX", + "class": "IN" + }, + "answer": [] + }, + { + "query": { + "name": "justtxt.joshdata.me", + "type": "A", + "class": "IN" + }, + "answer": [] + }, + { + "query": { + "name": "justtxt.joshdata.me", + "type": "AAAA", + "class": "IN" + }, + "answer": [] + }, { "query": { "name": "mail.example", diff --git a/tests/test_deliverability.py b/tests/test_deliverability.py index 7431668..52124eb 100644 --- a/tests/test_deliverability.py +++ b/tests/test_deliverability.py @@ -21,7 +21,7 @@ def test_deliverability_found(): def test_deliverability_fails(): - # No MX record. + # Domain does not exist. domain = 'xkxufoekjvjfjeodlfmdfjcu.com' with pytest.raises(EmailUndeliverableError, match=f'The domain name {domain} does not exist'): validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) @@ -36,6 +36,12 @@ def test_deliverability_fails(): with pytest.raises(EmailUndeliverableError, match=f'The domain name {domain} does not send email'): validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) + # No MX or A/AAAA records, but some other DNS records must + # exist such that the response is NOANSWER instead of NXDOMAIN. 
+ domain = 'justtxt.joshdata.me' + with pytest.raises(EmailUndeliverableError, match=f'The domain name {domain} does not accept email'): + validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) + @pytest.mark.parametrize( 'email_input', From 97c26c77d0725dad725664455ff6e7a34b5900b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20Martano?= Date: Mon, 29 Jan 2024 11:00:47 -0300 Subject: [PATCH 125/174] Fix typo in README.md (#124) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2c27c35..5379c06 100644 --- a/README.md +++ b/README.md @@ -208,7 +208,7 @@ local parts, a wider range of Unicode characters are allowed. A surprisingly large number of Unicode characters are not safe to display, especially when the email address is concatenated with other text, so this -library tries to protect you by not permitting resvered, non-, private use, +library tries to protect you by not permitting reserved, non-, private use, formatting (which can be used to alter the display order of characters), whitespace, and control characters, and combining characters as the first character of the local part and the domain name (so that they From 5d72f53412821189ebc826100fb2a673530c5ac6 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 22 Oct 2023 13:27:15 -0400 Subject: [PATCH 126/174] Relicense under the Unlicense (instead of CC0) I didn't do a comprehensive review of contributions from others but they have generally been fairly minimal. See #113 for rationale. --- CONTRIBUTING.md | 12 ++-- LICENSE | 148 +++++++++--------------------------------------- README.md | 6 +- setup.cfg | 4 +- 4 files changed, 40 insertions(+), 130 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a0b40f9..76e88b9 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,7 +1,7 @@ -## Public domain +This project is in the public domain. 
Copyright and related +rights in the work are waived through the [LICENSE](LICENSE) +file in this directory. -This project is in the public domain. Copyright and related rights in the work worldwide are waived through the [CC0 1.0 Universal public domain dedication][CC0]. See the LICENSE file in this directory. - -All contributions to this project must be released under the same CC0 wavier. By submitting a pull request or patch, you are agreeing to comply with this waiver of copyright interest. - -[CC0]: http://creativecommons.org/publicdomain/zero/1.0/ +All contributions to this project must be released under the +same terms. By submitting a pull request or patch, you are +agreeing to comply with this. diff --git a/LICENSE b/LICENSE index 0e259d4..122e7a7 100644 --- a/LICENSE +++ b/LICENSE @@ -1,121 +1,27 @@ -Creative Commons Legal Code - -CC0 1.0 Universal - - CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE - LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN - ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS - INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES - REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS - PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM - THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED - HEREUNDER. - -Statement of Purpose - -The laws of most jurisdictions throughout the world automatically confer -exclusive Copyright and Related Rights (defined below) upon the creator -and subsequent owner(s) (each and all, an "owner") of an original work of -authorship and/or a database (each, a "Work"). 
- -Certain owners wish to permanently relinquish those rights to a Work for -the purpose of contributing to a commons of creative, cultural and -scientific works ("Commons") that the public can reliably and without fear -of later claims of infringement build upon, modify, incorporate in other -works, reuse and redistribute as freely as possible in any form whatsoever -and for any purposes, including without limitation commercial purposes. -These owners may contribute to the Commons to promote the ideal of a free -culture and the further production of creative, cultural and scientific -works, or to gain reputation or greater distribution for their Work in -part through the use and efforts of others. - -For these and/or other purposes and motivations, and without any -expectation of additional consideration or compensation, the person -associating CC0 with a Work (the "Affirmer"), to the extent that he or she -is an owner of Copyright and Related Rights in the Work, voluntarily -elects to apply CC0 to the Work and publicly distribute the Work under its -terms, with knowledge of his or her Copyright and Related Rights in the -Work and the meaning and intended legal effect of CC0 on those rights. - -1. Copyright and Related Rights. A Work made available under CC0 may be -protected by copyright and related or neighboring rights ("Copyright and -Related Rights"). Copyright and Related Rights include, but are not -limited to, the following: - - i. the right to reproduce, adapt, distribute, perform, display, - communicate, and translate a Work; - ii. moral rights retained by the original author(s) and/or performer(s); -iii. publicity and privacy rights pertaining to a person's image or - likeness depicted in a Work; - iv. rights protecting against unfair competition in regards to a Work, - subject to the limitations in paragraph 4(a), below; - v. rights protecting the extraction, dissemination, use and reuse of data - in a Work; - vi. 
database rights (such as those arising under Directive 96/9/EC of the - European Parliament and of the Council of 11 March 1996 on the legal - protection of databases, and under any national implementation - thereof, including any amended or successor version of such - directive); and -vii. other similar, equivalent or corresponding rights throughout the - world based on applicable law or treaty, and any national - implementations thereof. - -2. Waiver. To the greatest extent permitted by, but not in contravention -of, applicable law, Affirmer hereby overtly, fully, permanently, -irrevocably and unconditionally waives, abandons, and surrenders all of -Affirmer's Copyright and Related Rights and associated claims and causes -of action, whether now known or unknown (including existing as well as -future claims and causes of action), in the Work (i) in all territories -worldwide, (ii) for the maximum duration provided by applicable law or -treaty (including future time extensions), (iii) in any current or future -medium and for any number of copies, and (iv) for any purpose whatsoever, -including without limitation commercial, advertising or promotional -purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each -member of the public at large and to the detriment of Affirmer's heirs and -successors, fully intending that such Waiver shall not be subject to -revocation, rescission, cancellation, termination, or any other legal or -equitable action to disrupt the quiet enjoyment of the Work by the public -as contemplated by Affirmer's express Statement of Purpose. - -3. Public License Fallback. Should any part of the Waiver for any reason -be judged legally invalid or ineffective under applicable law, then the -Waiver shall be preserved to the maximum extent permitted taking into -account Affirmer's express Statement of Purpose. 
In addition, to the -extent the Waiver is so judged Affirmer hereby grants to each affected -person a royalty-free, non transferable, non sublicensable, non exclusive, -irrevocable and unconditional license to exercise Affirmer's Copyright and -Related Rights in the Work (i) in all territories worldwide, (ii) for the -maximum duration provided by applicable law or treaty (including future -time extensions), (iii) in any current or future medium and for any number -of copies, and (iv) for any purpose whatsoever, including without -limitation commercial, advertising or promotional purposes (the -"License"). The License shall be deemed effective as of the date CC0 was -applied by Affirmer to the Work. Should any part of the License for any -reason be judged legally invalid or ineffective under applicable law, such -partial invalidity or ineffectiveness shall not invalidate the remainder -of the License, and in such case Affirmer hereby affirms that he or she -will not (i) exercise any of his or her remaining Copyright and Related -Rights in the Work or (ii) assert any associated claims and causes of -action with respect to the Work, in either case contrary to Affirmer's -express Statement of Purpose. - -4. Limitations and Disclaimers. - - a. No trademark or patent rights held by Affirmer are waived, abandoned, - surrendered, licensed or otherwise affected by this document. - b. Affirmer offers the Work as-is and makes no representations or - warranties of any kind concerning the Work, express, implied, - statutory or otherwise, including without limitation warranties of - title, merchantability, fitness for a particular purpose, non - infringement, or the absence of latent or other defects, accuracy, or - the present or absence of errors, whether or not discoverable, all to - the greatest extent permissible under applicable law. - c. 
Affirmer disclaims responsibility for clearing rights of other persons - that may apply to the Work or any use thereof, including without - limitation any person's Copyright and Related Rights in the Work. - Further, Affirmer disclaims responsibility for obtaining any necessary - consents, permissions or other rights required for any use of the - Work. - d. Affirmer understands and acknowledges that Creative Commons is not a - party to this document and has no duty or obligation with respect to - this CC0 or use of the Work. +This is free and unencumbered software released into the public +domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a +compiled binary, for any purpose, commercial or non-commercial, +and by any means. + +In jurisdictions that recognize copyright laws, the author or +authors of this software dedicate any and all copyright +interest in the software to the public domain. We make this +dedication for the benefit of the public at large and to the +detriment of our heirs and successors. We intend this +dedication to be an overt act of relinquishment in perpetuity +of all present and future rights to this software under +copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR +ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+ +For more information, please refer to diff --git a/README.md b/README.md index 5379c06..8b3b7b8 100644 --- a/README.md +++ b/README.md @@ -423,7 +423,6 @@ or likely to cause trouble: No one uses these forms anymore, and I can't think of any reason why anyone using this library would need to accept them. - Testing ------- @@ -455,3 +454,8 @@ git tag v$(cat email_validator/version.py | sed "s/.* = //" | sed 's/"//g') git push --tags ./release_to_pypi.sh ``` + +License +------- + +This project is free of any copyright restrictions per the [Unlicense](https://unlicense.org/). (Prior to Feb. 4, 2024, the project was made available under the terms of the [CC0 1.0 Universal public domain dedication](http://creativecommons.org/publicdomain/zero/1.0/).) See [LICENSE](LICENSE) and [CONTRIBUTING.md](CONTRIBUTING.md). \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index a69971d..3958ce0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -7,12 +7,12 @@ long_description_content_type = text/markdown url = https://github.com/JoshData/python-email-validator author = Joshua Tauberer author_email = jt@occams.info -license = CC0 (copyright waived) +license = Unlicense license_files = LICENSE classifiers = Development Status :: 5 - Production/Stable Intended Audience :: Developers - License :: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication + License :: Unlicense Programming Language :: Python :: 3 Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 From a5842577875a34c36acb10b3f0f39a60796ba7c8 Mon Sep 17 00:00:00 2001 From: Martsinkevich Artur <74377629+mrcnkwcz@users.noreply.github.com> Date: Wed, 7 Feb 2024 17:29:54 +0300 Subject: [PATCH 127/174] Improvements for more PEP compliance (duplicate imports, raise from, else's) (#125) 1. Removed duplicate imports 2. Added `raise from` for better exception tracing 3. 
Removed `else` in the check that handles the exception --- email_validator/deliverability.py | 10 ++++----- email_validator/exceptions_types.py | 1 - email_validator/syntax.py | 33 ++++++++++++++--------------- email_validator/validate_email.py | 4 ++-- 4 files changed, 23 insertions(+), 25 deletions(-) diff --git a/email_validator/deliverability.py b/email_validator/deliverability.py index 4846091..95d73bc 100644 --- a/email_validator/deliverability.py +++ b/email_validator/deliverability.py @@ -72,12 +72,12 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option deliverability_info["mx"] = [(0, str(r)) for r in response] deliverability_info["mx_fallback_type"] = "AAAA" - except dns.resolver.NoAnswer: + except dns.resolver.NoAnswer as e: # If there was no MX, A, or AAAA record, then mail to # this domain is not deliverable, although the domain # name has other records (otherwise NXDOMAIN would # have been raised). - raise EmailUndeliverableError(f"The domain name {domain_i18n} does not accept email.") + raise EmailUndeliverableError(f"The domain name {domain_i18n} does not accept email.") from e # Check for a SPF (RFC 7208) reject-all record ("v=spf1 -all") which indicates # no emails are sent from this domain (similar to a Null MX record @@ -96,10 +96,10 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option # No TXT records means there is no SPF policy, so we cannot take any action. pass - except dns.resolver.NXDOMAIN: + except dns.resolver.NXDOMAIN as e: # The domain name does not exist --- there are no records of any sort # for the domain name. - raise EmailUndeliverableError(f"The domain name {domain_i18n} does not exist.") + raise EmailUndeliverableError(f"The domain name {domain_i18n} does not exist.") from e except dns.resolver.NoNameservers: # All nameservers failed to answer the query. 
This might be a problem @@ -122,6 +122,6 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option # Unhandled conditions should not propagate. raise EmailUndeliverableError( "There was an error while checking if the domain name in the email address is deliverable: " + str(e) - ) + ) from e return deliverability_info diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py index 88bbf05..4522b4f 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/exceptions_types.py @@ -80,7 +80,6 @@ def __getattr__(self, key): @property def email(self): - import warnings warnings.warn("ValidatedEmail.email is deprecated and will be removed, use ValidatedEmail.normalized instead", DeprecationWarning) return self.normalized diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 3b5f204..6634ace 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -26,7 +26,6 @@ def split_email(email): # Since backslash-escaping is no longer needed because # the quotes are removed, remove backslash-escaping # to return in the normalized form. - import re local_part = re.sub(r"\\(.)", "\\1", local_part) return local_part, domain_part, True @@ -72,14 +71,14 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp if len(local) == 0: if not allow_empty_local: raise EmailSyntaxError("There must be something before the @-sign.") - else: - # The caller allows an empty local part. Useful for validating certain - # Postfix aliases. - return { - "local_part": local, - "ascii_local_part": local, - "smtputf8": False, - } + + # The caller allows an empty local part. Useful for validating certain + # Postfix aliases. + return { + "local_part": local, + "ascii_local_part": local, + "smtputf8": False, + } # Check the length of the local part by counting characters. 
# (RFC 5321 4.5.3.1.1) @@ -191,8 +190,8 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp # want to have an unhandled exception later. try: local.encode("utf8") - except ValueError: - raise EmailSyntaxError("The email address contains an invalid character.") + except ValueError as e: + raise EmailSyntaxError("The email address contains an invalid character.") from e # If this address passes only by the quoted string form, re-quote it # and backslash-escape quotes and backslashes (removing any unnecessary @@ -330,7 +329,7 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera try: domain = idna.uts46_remap(domain, std3_rules=False, transitional=False) except idna.IDNAError as e: - raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") + raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") from e # The domain part is made up dot-separated "labels." Each label must # have at least one character and cannot start or end with dashes, which @@ -374,11 +373,11 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera # the length check is applied to a string that is different from the # one the user supplied. Also I'm not sure if the length check applies # to the internationalized form, the IDNA ASCII form, or even both! - raise EmailSyntaxError("The email address is too long after the @-sign.") + raise EmailSyntaxError("The email address is too long after the @-sign.") from e # Other errors seem to not be possible because the call to idna.uts46_remap # would have already raised them. - raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") + raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") from e # Check the syntax of the string returned by idna.encode. # It should never fail. 
@@ -440,7 +439,7 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera try: domain_i18n = idna.decode(ascii_domain.encode('ascii')) except idna.IDNAError as e: - raise EmailSyntaxError(f"The part after the @-sign is not valid IDNA ({e}).") + raise EmailSyntaxError(f"The part after the @-sign is not valid IDNA ({e}).") from e # Check for invalid characters after normalization. These # should never arise. See the similar checks above. @@ -518,7 +517,7 @@ def validate_email_domain_literal(domain_literal): try: addr = ipaddress.IPv4Address(domain_literal) except ValueError as e: - raise EmailSyntaxError(f"The address in brackets after the @-sign is not valid: It is not an IPv4 address ({e}) or is missing an address literal tag.") + raise EmailSyntaxError(f"The address in brackets after the @-sign is not valid: It is not an IPv4 address ({e}) or is missing an address literal tag.") from e # Return the IPv4Address object and the domain back unchanged. return { @@ -531,7 +530,7 @@ def validate_email_domain_literal(domain_literal): try: addr = ipaddress.IPv6Address(domain_literal[5:]) except ValueError as e: - raise EmailSyntaxError(f"The IPv6 address in brackets after the @-sign is not valid ({e}).") + raise EmailSyntaxError(f"The IPv6 address in brackets after the @-sign is not valid ({e}).") from e # Return the IPv6Address object and construct a normalized # domain literal. 
diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index d2791fe..d6051a9 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -49,8 +49,8 @@ def validate_email( if not isinstance(email, str): try: email = email.decode("ascii") - except ValueError: - raise EmailSyntaxError("The email address is not valid ASCII.") + except ValueError as e: + raise EmailSyntaxError("The email address is not valid ASCII.") from e # Split the address into the local part (before the @-sign) # and the domain part (after the @-sign). Normally, there From fd7ed97ad6090537163e08e851b3255482a7e4cd Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 12 Feb 2024 06:46:11 -0500 Subject: [PATCH 128/174] In tests, don't configure the dns.resolver.Resolver when using mocked DNS data The mocked DNS response class, which allows tests to run without a live Internet connection, creates a dns.resolver.Resolver that still tries to be configured using /etc/resolv.conf. That fails when the file cannot be read. When using mocked data, this is now disabled. Fixes #127. --- email_validator/deliverability.py | 4 ++-- tests/mocked_dns_response.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/email_validator/deliverability.py b/email_validator/deliverability.py index 95d73bc..182331a 100644 --- a/email_validator/deliverability.py +++ b/email_validator/deliverability.py @@ -6,11 +6,11 @@ import dns.exception -def caching_resolver(*, timeout: Optional[int] = None, cache=None): +def caching_resolver(*, timeout: Optional[int] = None, cache=None, dns_resolver=None): if timeout is None: from . 
import DEFAULT_TIMEOUT timeout = DEFAULT_TIMEOUT - resolver = dns.resolver.Resolver() + resolver = dns_resolver or dns.resolver.Resolver() resolver.cache = cache or dns.resolver.LRUCache() # type: ignore resolver.lifetime = timeout # type: ignore # timeout, in seconds return resolver diff --git a/tests/mocked_dns_response.py b/tests/mocked_dns_response.py index 136de10..1c7d157 100644 --- a/tests/mocked_dns_response.py +++ b/tests/mocked_dns_response.py @@ -32,7 +32,8 @@ def create_resolver(): # Return a new dns.resolver.Resolver configured for caching # using the singleton instance. - return caching_resolver(cache=MockedDnsResponseData.INSTANCE) + dns_resolver = dns.resolver.Resolver(configure=BUILD_MOCKED_DNS_RESPONSE_DATA) + return caching_resolver(cache=MockedDnsResponseData.INSTANCE, dns_resolver=dns_resolver) def __init__(self): self.data = {} From 306948daaf179229b722083f30460ce476a5fc76 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 12 Feb 2024 06:47:28 -0500 Subject: [PATCH 129/174] Disable tests that (still) require Internet access in the default build in the Makefile test_caching_dns_resolver still requires Internet access. It has a pytest.mark decorator, but "make test" still runs it. Now "make test" skips this test. It's probably duplicative with the other tests because the MockedDnsResponseData resolver uses a dns.resolver.Resolver cache internally. Fixes #128. 
--- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 7233a19..7898e4f 100644 --- a/Makefile +++ b/Makefile @@ -17,7 +17,7 @@ typing: .PHONY: test test: - PYTHONPATH=.:$PYTHONPATH pytest --cov=email_validator + PYTHONPATH=.:$PYTHONPATH pytest --cov=email_validator -k "not network" .PHONY: testcov testcov: test From 19c4d34f5d59fa0f6d52c0d875f59e4fa81bc2ce Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 12 Feb 2024 08:33:07 -0500 Subject: [PATCH 130/174] Add tests that quoted local parts are unquoted in the returned normalized address where possible --- README.md | 4 ++-- tests/test_syntax.py | 30 ++++++++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 8b3b7b8..921af3d 100644 --- a/README.md +++ b/README.md @@ -12,8 +12,8 @@ necessarily for composing an email message, see below). Key features: -* Checks that an email address has the correct syntax --- good for - registration/login forms or other uses related to identifying users. +* Checks that an email address has the correct syntax --- great for + email-based registration/login forms or validing data. * Gives friendly English error messages when validation fails that you can display to end-users. * Checks deliverability (optional): Does the domain name resolve? 
diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 8709845..693d7da 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -68,15 +68,41 @@ ascii_email='jeff@xn--fiqq24b10vi0d.tw', ), ), + ( + '"quoted local part"@example.org', + ValidatedEmail( + local_part='"quoted local part"', + ascii_local_part='"quoted local part"', + smtputf8=False, + ascii_domain='example.org', + domain='example.org', + normalized='"quoted local part"@example.org', + ascii_email='"quoted local part"@example.org' + ), + ), + ( + '"de-quoted.local.part"@example.org', + ValidatedEmail( + local_part='de-quoted.local.part', + ascii_local_part='de-quoted.local.part', + smtputf8=False, + ascii_domain='example.org', + domain='example.org', + normalized='de-quoted.local.part@example.org', + ascii_email='de-quoted.local.part@example.org' + ), + ), ], ) def test_email_valid(email_input, output): # These addresses do not require SMTPUTF8. See test_email_valid_intl_local_part # for addresses that are valid but require SMTPUTF8. Check that it passes with # allow_smtput8 both on and off. - emailinfo = validate_email(email_input, check_deliverability=False, allow_smtputf8=False) + emailinfo = validate_email(email_input, check_deliverability=False, allow_smtputf8=False, + allow_quoted_local=True) assert emailinfo == output - assert validate_email(email_input, check_deliverability=False, allow_smtputf8=True) == output + assert validate_email(email_input, check_deliverability=False, allow_smtputf8=True, + allow_quoted_local=True) == output # Check that the old `email` attribute to access the normalized form still works # if the DeprecationWarning is suppressed. 
From 676e0ab8def3cbee6fc212ad66cd55c22a66df41 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 12 Feb 2024 08:59:01 -0500 Subject: [PATCH 131/174] Version 2.1.1 --- CHANGELOG.md | 10 ++++++++++ email_validator/version.py | 2 +- release_to_pypi.sh | 3 ++- setup.cfg | 2 +- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 02d2277..7321afc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +2.1.1 (February 26, 2024) +------------------------- + +* Fixed typo 'marking' instead of 'marketing' in case-insensitive mailbox name list. +* When DNS-based deliverability checks fail, in some cases exceptions are now thrown with `raise ... from` for better nested exception tracking. +* Fixed tests to work when no local resolver can be configured. +* This project is now licensed under the Unlicense (instead of CC0). +* Minor improvements to tests. +* Minor improvements to code style. + 2.1.0 (October 22, 2023) ------------------------ diff --git a/email_validator/version.py b/email_validator/version.py index acc96f7..58039f5 100644 --- a/email_validator/version.py +++ b/email_validator/version.py @@ -1 +1 @@ -__version__ = "2.1.0.post1" +__version__ = "2.1.1" diff --git a/release_to_pypi.sh b/release_to_pypi.sh index efef293..466f4f8 100755 --- a/release_to_pypi.sh +++ b/release_to_pypi.sh @@ -1,4 +1,5 @@ -#!/bin/sh +#!/bin/bash +source env/bin/activate pip3 install --upgrade build twine rm -rf dist python3 -m build diff --git a/setup.cfg b/setup.cfg index 3958ce0..3387df1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -12,7 +12,7 @@ license_files = LICENSE classifiers = Development Status :: 5 - Production/Stable Intended Audience :: Developers - License :: Unlicense + License :: OSI Approved :: The Unlicense (Unlicense) Programming Language :: Python :: 3 Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 From c309197c3b8bea880d36c7efc745e23ba26c2b65 Mon Sep 17 00:00:00 2001 From: Joshua 
Tauberer Date: Mon, 26 Feb 2024 21:37:44 -0500 Subject: [PATCH 132/174] Update test_requirements to latest package versions supported on Py 3.8 --- test_requirements.txt | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/test_requirements.txt b/test_requirements.txt index db9bbbd..8ba9879 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -7,20 +7,20 @@ # the earliest Python version we support, and some exception # messages may depend on package versions, so we pin versions # for reproducible testing.) -coverage==7.3.2 -dnspython==2.4.2 -exceptiongroup==1.1.3 -flake8==6.1.0 -idna==3.4 +coverage==7.4.3 +dnspython==2.6.1 +exceptiongroup==1.2.0 +flake8==7.0.0 +idna==3.6 iniconfig==2.0.0 mccabe==0.7.0 -mypy==1.6.1 +mypy==1.8.0 mypy-extensions==1.0.0 packaging==23.2 -pluggy==1.3.0 +pluggy==1.4.0 pycodestyle==2.11.1 -pyflakes==3.1.0 -pytest==7.4.2 +pyflakes==3.2.0 +pytest==8.0.2 pytest-cov==4.1.0 tomli==2.0.1 -typing_extensions==4.8.0 +typing_extensions==4.10.0 From ea5254678ff5cd65f9ec8c1d9728700a375a2c97 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 26 Feb 2024 21:44:46 -0500 Subject: [PATCH 133/174] Add missing pyproject.toml file which may explain why tests requiring internet access were not working for others --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1379d17 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,4 @@ +[tool.pytest.ini_options] +markers = [ + "network: marks tests as requiring Internet access", +] From 1f2690cbe74a0e4d61e3ed08f08a469fe2f0ae1b Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Fri, 12 Apr 2024 06:39:49 -0400 Subject: [PATCH 134/174] Update test_requirements to bump idna, fixes #135 --- test_requirements.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/test_requirements.txt b/test_requirements.txt index 
8ba9879..d05813d 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -7,20 +7,20 @@ # the earliest Python version we support, and some exception # messages may depend on package versions, so we pin versions # for reproducible testing.) -coverage==7.4.3 +coverage==7.4.4 dnspython==2.6.1 exceptiongroup==1.2.0 flake8==7.0.0 -idna==3.6 +idna==3.7 iniconfig==2.0.0 mccabe==0.7.0 -mypy==1.8.0 +mypy==1.9.0 mypy-extensions==1.0.0 -packaging==23.2 +packaging==24.0 pluggy==1.4.0 pycodestyle==2.11.1 pyflakes==3.2.0 -pytest==8.0.2 -pytest-cov==4.1.0 +pytest==8.1.1 +pytest-cov==5.0.0 tomli==2.0.1 -typing_extensions==4.10.0 +typing_extensions==4.11.0 From 7011e6990e97eba86d21ae99d1cc2a609d2ca0bc Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 14 Apr 2024 13:28:36 -0400 Subject: [PATCH 135/174] Fix escaping of `$` in Makefile to adjust PATH --- CHANGELOG.md | 5 +++++ Makefile | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7321afc..93bf5a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +In Development +-------------- + +* Fixes in tests. 
+ 2.1.1 (February 26, 2024) ------------------------- diff --git a/Makefile b/Makefile index 7898e4f..57df9da 100644 --- a/Makefile +++ b/Makefile @@ -17,7 +17,7 @@ typing: .PHONY: test test: - PYTHONPATH=.:$PYTHONPATH pytest --cov=email_validator -k "not network" + PYTHONPATH=.:$$PYTHONPATH pytest --cov=email_validator -k "not network" .PHONY: testcov testcov: test From d6d3d15da96483e830c851a65059ae651c08b96d Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 14 Apr 2024 13:38:47 -0400 Subject: [PATCH 136/174] Add a deliverability test for a MX-fallback A record --- tests/mocked-dns-answers.json | 29 +++++++++++++++++++++++++++++ tests/test_deliverability.py | 18 ++++++++++-------- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/tests/mocked-dns-answers.json b/tests/mocked-dns-answers.json index 19e443c..ddc46b7 100644 --- a/tests/mocked-dns-answers.json +++ b/tests/mocked-dns-answers.json @@ -13,6 +13,35 @@ "5 gmail-smtp-in.l.google.com." ] }, + { + "query": { + "name": "pages.github.com", + "type": "MX", + "class": "IN" + }, + "answer": [] + }, + { + "query": { + "name": "pages.github.com", + "type": "A", + "class": "IN" + }, + "answer": [ + "185.199.108.153", + "185.199.109.153", + "185.199.110.153", + "185.199.111.153" + ] + }, + { + "query": { + "name": "pages.github.com", + "type": "TXT", + "class": "IN" + }, + "answer": [] + }, { "query": { "name": "xkxufoekjvjfjeodlfmdfjcu.com", diff --git a/tests/test_deliverability.py b/tests/test_deliverability.py index 52124eb..7411c02 100644 --- a/tests/test_deliverability.py +++ b/tests/test_deliverability.py @@ -10,14 +10,16 @@ RESOLVER = MockedDnsResponseData.create_resolver() -def test_deliverability_found(): - response = validate_email_deliverability('gmail.com', 'gmail.com', dns_resolver=RESOLVER) - assert response.keys() == {'mx', 'mx_fallback_type'} - assert response['mx_fallback_type'] is None - assert len(response['mx']) > 1 - assert len(response['mx'][0]) == 2 - assert 
isinstance(response['mx'][0][0], int) - assert response['mx'][0][1].endswith('.com') +@pytest.mark.parametrize( + 'domain,expected_response', + [ + ('gmail.com', {'mx': [(5, 'gmail-smtp-in.l.google.com'), (10, 'alt1.gmail-smtp-in.l.google.com'), (20, 'alt2.gmail-smtp-in.l.google.com'), (30, 'alt3.gmail-smtp-in.l.google.com'), (40, 'alt4.gmail-smtp-in.l.google.com')], 'mx_fallback_type': None}), + ('pages.github.com', {'mx': [(0, '185.199.108.153'), (0, '185.199.109.153'), (0, '185.199.111.153'), (0, '185.199.110.153')], 'mx_fallback_type': 'A'}), + ], +) +def test_deliverability_found(domain, expected_response): + response = validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) + assert response == expected_response def test_deliverability_fails(): From da48fd13bcccc0df52d90f4709d3cf9ec257e181 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 14 Apr 2024 13:39:21 -0400 Subject: [PATCH 137/174] Fix the returned implicit MX record when there is a fallback The object returned by validate_email returns the queried MX records when deliverability checks are run. When there is an implicit MX record (no MX record but an A or AAAA record), the value is a single entry that points to the host, not a list of the A or AAAA values. RFC 5321 5.1: > If an empty list of MXs is returned, the address is treated as if it was associated with an implicit MX RR, with a preference of 0, pointing to that host. --- CHANGELOG.md | 1 + email_validator/deliverability.py | 4 ++-- tests/test_deliverability.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 93bf5a3..01dbef6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ In Development -------------- +* When a domain name has no MX record but does have an A or AAAA record, the mx field in the object returned by validate_email incorrectly held the IP addresses rather than the domain itself. * Fixes in tests. 
2.1.1 (February 26, 2024) diff --git a/email_validator/deliverability.py b/email_validator/deliverability.py index 182331a..65eea51 100644 --- a/email_validator/deliverability.py +++ b/email_validator/deliverability.py @@ -60,7 +60,7 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option # If there was no MX record, fall back to an A record. (RFC 5321 Section 5) try: response = dns_resolver.resolve(domain, "A") - deliverability_info["mx"] = [(0, str(r)) for r in response] + deliverability_info["mx"] = [(0, domain)] deliverability_info["mx_fallback_type"] = "A" except dns.resolver.NoAnswer: @@ -69,7 +69,7 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option # (It's unclear if SMTP servers actually do this.) try: response = dns_resolver.resolve(domain, "AAAA") - deliverability_info["mx"] = [(0, str(r)) for r in response] + deliverability_info["mx"] = [(0, domain)] deliverability_info["mx_fallback_type"] = "AAAA" except dns.resolver.NoAnswer as e: diff --git a/tests/test_deliverability.py b/tests/test_deliverability.py index 7411c02..17dace5 100644 --- a/tests/test_deliverability.py +++ b/tests/test_deliverability.py @@ -14,7 +14,7 @@ 'domain,expected_response', [ ('gmail.com', {'mx': [(5, 'gmail-smtp-in.l.google.com'), (10, 'alt1.gmail-smtp-in.l.google.com'), (20, 'alt2.gmail-smtp-in.l.google.com'), (30, 'alt3.gmail-smtp-in.l.google.com'), (40, 'alt4.gmail-smtp-in.l.google.com')], 'mx_fallback_type': None}), - ('pages.github.com', {'mx': [(0, '185.199.108.153'), (0, '185.199.109.153'), (0, '185.199.111.153'), (0, '185.199.110.153')], 'mx_fallback_type': 'A'}), + ('pages.github.com', {'mx': [(0, 'pages.github.com')], 'mx_fallback_type': 'A'}), ], ) def test_deliverability_found(domain, expected_response): From 8ec4239eb1c64c6e4840913dc7c81ecf5a25097e Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 14 Apr 2024 13:51:38 -0400 Subject: [PATCH 138/174] Parameterize test_deliverability_fails --- 
tests/test_deliverability.py | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/tests/test_deliverability.py b/tests/test_deliverability.py index 17dace5..262e252 100644 --- a/tests/test_deliverability.py +++ b/tests/test_deliverability.py @@ -22,26 +22,20 @@ def test_deliverability_found(domain, expected_response): assert response == expected_response -def test_deliverability_fails(): - # Domain does not exist. - domain = 'xkxufoekjvjfjeodlfmdfjcu.com' - with pytest.raises(EmailUndeliverableError, match=f'The domain name {domain} does not exist'): - validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) - - # Null MX record. - domain = 'example.com' - with pytest.raises(EmailUndeliverableError, match=f'The domain name {domain} does not accept email'): - validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) - - # No MX record, A record fallback, reject-all SPF record. - domain = 'nellis.af.mil' - with pytest.raises(EmailUndeliverableError, match=f'The domain name {domain} does not send email'): - validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) +@pytest.mark.parametrize( + 'domain,error', + [ + ('xkxufoekjvjfjeodlfmdfjcu.com', 'The domain name {domain} does not exist'), + ('example.com', 'The domain name {domain} does not accept email'), # Null MX record + ('nellis.af.mil', 'The domain name {domain} does not send email'), # No MX record, A record fallback, reject-all SPF record. - # No MX or A/AAAA records, but some other DNS records must - # exist such that the response is NOANSWER instead of NXDOMAIN. - domain = 'justtxt.joshdata.me' - with pytest.raises(EmailUndeliverableError, match=f'The domain name {domain} does not accept email'): + # No MX or A/AAAA records, but some other DNS records must + # exist such that the response is NOANSWER instead of NXDOMAIN. 
+ ('justtxt.joshdata.me', 'The domain name {domain} does not accept email'), + ], +) +def test_deliverability_fails(domain, error): + with pytest.raises(EmailUndeliverableError, match=error.format(domain=domain)): validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) From 3b1b45c1fba162b509ad008584281f2c29c95434 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 14 Apr 2024 13:53:29 -0400 Subject: [PATCH 139/174] Check that fallback A/AAAA records are globally reachable IP addresses, fixes #134 --- CHANGELOG.md | 1 + email_validator/deliverability.py | 28 +++++++++++++++++++++++++++- tests/mocked-dns-answers.json | 26 ++++++++++++++++++++++++++ tests/test_deliverability.py | 1 + 4 files changed, 55 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 01dbef6..1e41c4c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ In Development -------------- +* When a domain name has no MX record but does have an A or AAAA record, if none of the IP addresses in the response are globally reachable (i.e. not Private-Use, Loopback, etc.), the response is treated as if there was no A/AAAA response and the email address will fail the deliverability check. * When a domain name has no MX record but does have an A or AAAA record, the mx field in the object returned by validate_email incorrectly held the IP addresses rather than the domain itself. * Fixes in tests. diff --git a/email_validator/deliverability.py b/email_validator/deliverability.py index 65eea51..e2e5076 100644 --- a/email_validator/deliverability.py +++ b/email_validator/deliverability.py @@ -1,5 +1,7 @@ from typing import Optional, Any, Dict +import ipaddress + from .exceptions_types import EmailUndeliverableError import dns.resolver @@ -57,9 +59,29 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option deliverability_info["mx_fallback_type"] = None except dns.resolver.NoAnswer: - # If there was no MX record, fall back to an A record. 
(RFC 5321 Section 5) + # If there was no MX record, fall back to an A or AAA record + # (RFC 5321 Section 5). Check A first since it's more common. + + # If the A/AAAA response has no Globally Reachable IP address, + # treat the response as if it were NoAnswer, i.e., the following + # address types are not allowed fallbacks: Private-Use, Loopback, + # Link-Local, and some other obscure ranges. See + # https://www.iana.org/assignments/iana-ipv4-special-registry/iana-ipv4-special-registry.xhtml + # https://www.iana.org/assignments/iana-ipv6-special-registry/iana-ipv6-special-registry.xhtml + # (Issue #134.) + def is_global_addr(ipaddr): + try: + ipaddr = ipaddress.ip_address(ipaddr) + except ValueError: + return False + return ipaddr.is_global + try: response = dns_resolver.resolve(domain, "A") + + if not any(is_global_addr(r.address) for r in response): + raise dns.resolver.NoAnswer # fall back to AAAA + deliverability_info["mx"] = [(0, domain)] deliverability_info["mx_fallback_type"] = "A" @@ -69,6 +91,10 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option # (It's unclear if SMTP servers actually do this.) try: response = dns_resolver.resolve(domain, "AAAA") + + if not any(is_global_addr(r.address) for r in response): + raise dns.resolver.NoAnswer + deliverability_info["mx"] = [(0, domain)] deliverability_info["mx_fallback_type"] = "AAAA" diff --git a/tests/mocked-dns-answers.json b/tests/mocked-dns-answers.json index ddc46b7..12d3885 100644 --- a/tests/mocked-dns-answers.json +++ b/tests/mocked-dns-answers.json @@ -60,6 +60,32 @@ "0 ." 
] }, + { + "query": { + "name": "g.mail.com", + "type": "MX", + "class": "IN" + }, + "answer": [] + }, + { + "query": { + "name": "g.mail.com", + "type": "A", + "class": "IN" + }, + "answer": [] + }, + { + "query": { + "name": "g.mail.com", + "type": "AAAA", + "class": "IN" + }, + "answer": [ + "::1" + ] + }, { "query": { "name": "nellis.af.mil", diff --git a/tests/test_deliverability.py b/tests/test_deliverability.py index 262e252..0ed5c3f 100644 --- a/tests/test_deliverability.py +++ b/tests/test_deliverability.py @@ -27,6 +27,7 @@ def test_deliverability_found(domain, expected_response): [ ('xkxufoekjvjfjeodlfmdfjcu.com', 'The domain name {domain} does not exist'), ('example.com', 'The domain name {domain} does not accept email'), # Null MX record + ('g.mail.com', 'The domain name {domain} does not accept email'), # No MX record but invalid AAAA record fallback (issue #134) ('nellis.af.mil', 'The domain name {domain} does not send email'), # No MX record, A record fallback, reject-all SPF record. # No MX or A/AAAA records, but some other DNS records must From 4691a6244f6bfad556cb8ea49591e8db51f59fcb Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 6 Feb 2024 07:20:34 -0500 Subject: [PATCH 140/174] Parse `display name ` syntax Per request in #116, parse display name syntax also, but don't allow it unless a new allow_display_name option is set. Parsing according to the MIME specification probably isn't what's generally wanted since the use case is probably parsing inputs in email composition-like user interfaces. So it's in the spirit of a MIME message but not the letter. If display name syntax is permitted, return the unquoted/unescaped display name in the returned object. 
--- CHANGELOG.md | 1 + README.md | 17 +-- email_validator/__init__.py | 1 + email_validator/exceptions_types.py | 7 +- email_validator/rfc_constants.py | 5 +- email_validator/syntax.py | 164 ++++++++++++++++++++++++---- email_validator/validate_email.py | 18 ++- tests/test_syntax.py | 50 ++++++++- 8 files changed, 220 insertions(+), 43 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1e41c4c..c353f8f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ In Development -------------- +* A new option to parse `My Name ` strings, i.e. a display name plus an email address in angle brackets, is now available. It is off by default. * When a domain name has no MX record but does have an A or AAAA record, if none of the IP addresses in the response are globally reachable (i.e. not Private-Use, Loopback, etc.), the response is treated as if there was no A/AAAA response and the email address will fail the deliverability check. * When a domain name has no MX record but does have an A or AAAA record, the mx field in the object returned by validate_email incorrectly held the IP addresses rather than the domain itself. * Fixes in tests. diff --git a/README.md b/README.md index 921af3d..2c12c93 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,7 @@ Python 3.8+ by [Joshua Tauberer](https://joshdata.me). This library validates that a string is of the form `name@example.com` and optionally checks that the domain name is set up to receive email. This is the sort of validation you would want when you are identifying -users by their email address like on a registration/login form (but not -necessarily for composing an email message, see below). +users by their email address like on a registration form. Key features: @@ -18,7 +17,9 @@ Key features: can display to end-users. * Checks deliverability (optional): Does the domain name resolve? (You can override the default DNS resolver to add query caching.) 
-* Supports internationalized domain names and internationalized local parts. +* Supports internationalized domain names (like `@ツ.life`), + internationalized local parts (like `ツ@example.com`), + and optionally parses display names (e.g. `"My Name" `). * Rejects addresses with unsafe Unicode characters, obsolete email address syntax that you'd find unexpected, special use domain names like `@localhost`, and domains without a dot by default. This is an @@ -28,9 +29,8 @@ Key features: * Python type annotations are used. This is an opinionated library. You should definitely also consider using -the less-opinionated [pyIsEmail](https://github.com/michaelherold/pyIsEmail) and -[flanker](https://github.com/mailgun/flanker) if they are better for your -use case. +the less-opinionated [pyIsEmail](https://github.com/michaelherold/pyIsEmail) +if it works better for you. [![Build Status](https://github.com/JoshData/python-email-validator/actions/workflows/test_and_build.yaml/badge.svg)](https://github.com/JoshData/python-email-validator/actions/workflows/test_and_build.yaml) @@ -144,6 +144,8 @@ The `validate_email` function also accepts the following keyword arguments `allow_domain_literal=False`: Set to `True` to allow bracketed IPv4 and "IPv6:"-prefixd IPv6 addresses in the domain part of the email address. No deliverability checks are performed for these addresses. In the object returned by `validate_email`, the normalized domain will use the condensed IPv6 format, if applicable. The object's `domain_address` attribute will hold the parsed `ipaddress.IPv4Address` or `ipaddress.IPv6Address` object if applicable. You can also set `email_validator.ALLOW_DOMAIN_LITERAL` to `True` to turn this on for all calls by default. +`allow_display_name=False`: Set to `True` to allow a display name and bracketed address in the input string, like `My Name `. It's implemented in the spirit but not the letter of RFC 5322 3.4, so it may be stricter or more relaxed than what you want. 
The display name, if present, is provided in the returned object's `display_name` field after being unquoted and unescaped. You can also set `email_validator.ALLOW_DISPLAY_NAME` to `True` to turn this on for all calls by default. + `allow_empty_local=False`: Set to `True` to allow an empty local part (i.e. `@example.com`), e.g. for validating Postfix aliases. @@ -395,6 +397,7 @@ are: | `domain` | The canonical internationalized Unicode form of the domain part of the email address. If the returned string contains non-ASCII characters, either the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) feature of your mail relay will be required to transmit the message or else the email address's domain part must be converted to IDNA ASCII first: Use `ascii_domain` field instead. | | `ascii_domain` | The [IDNA](https://tools.ietf.org/html/rfc5891) [Punycode](https://www.rfc-editor.org/rfc/rfc3492.txt)-encoded form of the domain part of the given email address, as it would be transmitted on the wire. | | `domain_address` | If domain literals are allowed and if the email address contains one, an `ipaddress.IPv4Address` or `ipaddress.IPv6Address` object. | +| `display_name` | If no display name was present and angle brackets do not surround the address, this will be `None`; otherwise, it will be set to the display name, or the empty string if there were angle brackets but no display name. If the display name was quoted, it will be unquoted and unescaped. | | `smtputf8` | A boolean indicating that the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) feature of your mail relay will be required to transmit messages to this address because the local part of the address has non-ASCII characters (the local part cannot be IDNA-encoded). If `allow_smtputf8=False` is passed as an argument, this flag will always be false because an exception is raised if it would have been true. 
| | `mx` | A list of (priority, domain) tuples of MX records specified in the DNS for the domain (see [RFC 5321 section 5](https://tools.ietf.org/html/rfc5321#section-5)). May be `None` if the deliverability check could not be completed because of a temporary issue like a timeout. | | `mx_fallback_type` | `None` if an `MX` record is found. If no MX records are actually specified in DNS and instead are inferred, through an obsolete mechanism, from A or AAAA records, the value is the type of DNS record used instead (`A` or `AAAA`). May be `None` if the deliverability check could not be completed because of a temporary issue like a timeout. | @@ -458,4 +461,4 @@ git push --tags License ------- -This project is free of any copyright restrictions per the [Unlicense](https://unlicense.org/). (Prior to Feb. 4, 2024, the project was made available under the terms of the [CC0 1.0 Universal public domain dedication](http://creativecommons.org/publicdomain/zero/1.0/).) See [LICENSE](LICENSE) and [CONTRIBUTING.md](CONTRIBUTING.md). \ No newline at end of file +This project is free of any copyright restrictions per the [Unlicense](https://unlicense.org/). (Prior to Feb. 4, 2024, the project was made available under the terms of the [CC0 1.0 Universal public domain dedication](http://creativecommons.org/publicdomain/zero/1.0/).) See [LICENSE](LICENSE) and [CONTRIBUTING.md](CONTRIBUTING.md). 
diff --git a/email_validator/__init__.py b/email_validator/__init__.py index cd1b301..3f10088 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -25,6 +25,7 @@ def caching_resolver(*args, **kwargs): ALLOW_SMTPUTF8 = True ALLOW_QUOTED_LOCAL = False ALLOW_DOMAIN_LITERAL = False +ALLOW_DISPLAY_NAME = False GLOBALLY_DELIVERABLE = True CHECK_DELIVERABILITY = True TEST_ENVIRONMENT = False diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py index 4522b4f..7483b0b 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/exceptions_types.py @@ -62,6 +62,9 @@ class ValidatedEmail: mechanism, from A or AAAA records, the value is the type of DNS record used instead (`A` or `AAAA`).""" mx_fallback_type: str + """The display name in the original input text, unquoted and unescaped, or None.""" + display_name: str + """Tests use this constructor.""" def __init__(self, **kwargs): for k, v in kwargs.items(): @@ -120,6 +123,7 @@ def __eq__(self, other): and repr(sorted(self.mx) if getattr(self, 'mx', None) else None) == repr(sorted(other.mx) if getattr(other, 'mx', None) else None) and getattr(self, 'mx_fallback_type', None) == getattr(other, 'mx_fallback_type', None) + and getattr(self, 'display_name', None) == getattr(other, 'display_name', None) ) """This helps producing the README.""" @@ -128,7 +132,8 @@ def as_constructor(self): + ",".join(f"\n {key}={repr(getattr(self, key))}" for key in ('normalized', 'local_part', 'domain', 'ascii_email', 'ascii_local_part', 'ascii_domain', - 'smtputf8', 'mx', 'mx_fallback_type') + 'smtputf8', 'mx', 'mx_fallback_type', + 'display_name') if hasattr(self, key) ) \ + ")" diff --git a/email_validator/rfc_constants.py b/email_validator/rfc_constants.py index a6b9c59..a802c97 100644 --- a/email_validator/rfc_constants.py +++ b/email_validator/rfc_constants.py @@ -13,7 +13,7 @@ # RFC 3629 section 4, which appear to be the Unicode code points from # U+0080 to U+10FFFF. 
ATEXT_INTL = ATEXT + "\u0080-\U0010FFFF" -ATEXT_INTL_RE = re.compile('[.' + ATEXT_INTL + ']') # ATEXT_INTL plus dots +ATEXT_INTL_DOT_RE = re.compile('[.' + ATEXT_INTL + ']') # ATEXT_INTL plus dots DOT_ATOM_TEXT_INTL = re.compile('[' + ATEXT_INTL + ']+(?:\\.[' + ATEXT_INTL + r']+)*\Z') # The domain part of the email address, after IDNA (ASCII) encoding, @@ -30,10 +30,9 @@ # Quoted-string local part (RFC 5321 4.1.2, internationalized by RFC 6531 3.3) # The permitted characters in a quoted string are the characters in the range # 32-126, except that quotes and (literal) backslashes can only appear when escaped -# by a backslash. When internationalized, UTF8 strings are also permitted except +# by a backslash. When internationalized, UTF-8 strings are also permitted except # the ASCII characters that are not previously permitted (see above). # QUOTED_LOCAL_PART_ADDR = re.compile(r"^\"((?:[\u0020-\u0021\u0023-\u005B\u005D-\u007E]|\\[\u0020-\u007E])*)\"@(.*)") -QUOTED_LOCAL_PART_ADDR = re.compile(r"^\"((?:[^\"\\]|\\.)*)\"@(.*)") QTEXT_INTL = re.compile(r"[\u0020-\u007E\u0080-\U0010FFFF]") # Length constants diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 6634ace..b8df0e6 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -1,8 +1,7 @@ from .exceptions_types import EmailSyntaxError from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \ - DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \ - DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS, \ - QUOTED_LOCAL_PART_ADDR + DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_DOT_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \ + DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS import re import unicodedata @@ -12,31 +11,148 @@ def split_email(email): - # Return the local part and domain part of the address and - # whether the local part 
was quoted as a three-tuple. + # Return the display name, unescaped local part, and domain part + # of the address, and whether the local part was quoted. If no + # display name was present and angle brackets do not surround + # the address, display name will be None; otherwise, it will be + # set to the display name or the empty string if there were + # angle brackets but no display name. + + # Typical email addresses have a single @-sign and no quote + # characters, but the awkward "quoted string" local part form + # (RFC 5321 4.1.2) allows @-signs and escaped quotes to appear + # in the local part if the local part is quoted. + + # A `display name ` format is also present in MIME messages + # (RFC 5322 3.4) and this format is also often recognized in + # mail UIs. It's not allowed in SMTP commands or in typical web + # login forms, but parsing it has been requested, so it's done + # here as a convenience. It's implemented in the spirit but not + # the letter of RFC 5322 3.4 because MIME messages allow newlines + # and comments as a part of the CFWS rule, but this is typically + # not allowed in mail UIs (although comment syntax was requested + # once too). + # + # Display names are either basic characters (the same basic characters + # permitted in email addresses, but periods are not allowed and spaces + # are allowed; see RFC 5322 Appendix A.1.2), or a quoted string with + # the same rules as a quoted local part. (Multiple quoted strings might + # be allowed? Unclear.) Optional space (RFC 5322 3.4 CFWS) and then the + # email address follows in angle brackets. + # + # An initial quote is ambiguous between starting a display name or + # a quoted local part --- fun. + # + # We assume the input string is already stripped of leading and + # trailing CFWS. + + def split_string_at_unquoted_special(text, specials): + # Split the string at the first character in specials (an @-sign + # or left angle bracket) that does not occur within quotes. 
+ inside_quote = False + escaped = False + left_part = "" + for c in text: + if inside_quote: + left_part += c + if c == '\\' and not escaped: + escaped = True + elif c == '"' and not escaped: + # The only way to exit the quote is an unescaped quote. + inside_quote = False + escaped = False + else: + escaped = False + elif c == '"': + left_part += c + inside_quote = True + elif c in specials: + # When unquoted, stop before a special character. + break + else: + left_part += c + + # The right part is whatever is left. + right_part = text[len(left_part):] + + return left_part, right_part + + def unquote_quoted_string(text): + # Remove surrounding quotes and unescape escaped backslashes + # and quotes. Escapes are parsed liberally. I think only + # backslashes and quotes can be escaped but we'll allow anything + # to be. + quoted = False + escaped = False + value = "" + for i, c in enumerate(text): + if quoted: + if escaped: + value += c + escaped = False + elif c == '\\': + escaped = True + elif c == '"': + if i != len(text) - 1: + raise EmailSyntaxError("Extra character(s) found after close quote: " + + ", ".join(safe_character_display(c) for c in text[i + 1:])) + break + else: + value += c + elif i == 0 and c == '"': + quoted = True + else: + value += c + + return value, quoted + + # Split the string at the first unquoted @-sign or left angle bracket. + left_part, right_part = split_string_at_unquoted_special(email, ("@", "<")) + + # If the right part starts with an angle bracket, + # then the left part is a display name and the rest + # of the right part up to the final right angle bracket + # is the email address, . + if right_part.startswith("<"): + # Remove space between the display name and angle bracket. + left_part = left_part.rstrip() + + # Unquote and unescape the display name. + display_name, display_name_quoted = unquote_quoted_string(left_part) + + # Check that only basic characters are present in a + # non-quoted display name. 
+ if not display_name_quoted: + bad_chars = { + safe_character_display(c) + for c in display_name + if (not ATEXT_RE.match(c) and c != ' ') or c == '.' + } + if bad_chars: + raise EmailSyntaxError("The display name contains invalid characters when not quoted: " + ", ".join(sorted(bad_chars)) + ".") - # Typical email addresses have a single @-sign, but the - # awkward "quoted string" local part form (RFC 5321 4.1.2) - # allows @-signs (and escaped quotes) to appear in the local - # part if the local part is quoted. If the address is quoted, - # split it at a non-escaped @-sign and unescape the escaping. - if m := QUOTED_LOCAL_PART_ADDR.match(email): - local_part, domain_part = m.groups() + # Check for other unsafe characters. + check_unsafe_chars(display_name, allow_space=True) - # Since backslash-escaping is no longer needed because - # the quotes are removed, remove backslash-escaping - # to return in the normalized form. - local_part = re.sub(r"\\(.)", "\\1", local_part) + # Remove the initial and trailing angle brackets. + addr_spec = right_part[1:].rstrip(">") - return local_part, domain_part, True + # Split the email address at the first unquoted @-sign. + local_part, domain_part = split_string_at_unquoted_special(addr_spec, ("@",)) + # Otherwise there is no display name. The left part is the local + # part and the right part is the domain. else: - # Split at the one and only at-sign. - parts = email.split('@') - if len(parts) != 2: - raise EmailSyntaxError("The email address is not valid. It must have exactly one @-sign.") - local_part, domain_part = parts - return local_part, domain_part, False + display_name = None + local_part, domain_part = left_part, right_part + + if domain_part.startswith("@"): + domain_part = domain_part[1:] + + # Unquote the local part if it is quoted. 
+ local_part, is_quoted_local_part = unquote_quoted_string(local_part) + + return display_name, local_part, domain_part, is_quoted_local_part def get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH): @@ -215,7 +331,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp bad_chars = { safe_character_display(c) for c in local - if not ATEXT_INTL_RE.match(c) + if not ATEXT_INTL_DOT_RE.match(c) } if bad_chars: raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".") diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index d6051a9..f73a479 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -13,6 +13,7 @@ def validate_email( allow_empty_local: bool = False, allow_quoted_local: Optional[bool] = None, allow_domain_literal: Optional[bool] = None, + allow_display_name: Optional[bool] = None, check_deliverability: Optional[bool] = None, test_environment: Optional[bool] = None, globally_deliverable: Optional[bool] = None, @@ -26,7 +27,7 @@ def validate_email( """ # Fill in default values of arguments. - from . import ALLOW_SMTPUTF8, ALLOW_QUOTED_LOCAL, ALLOW_DOMAIN_LITERAL, \ + from . 
import ALLOW_SMTPUTF8, ALLOW_QUOTED_LOCAL, ALLOW_DOMAIN_LITERAL, ALLOW_DISPLAY_NAME, \ GLOBALLY_DELIVERABLE, CHECK_DELIVERABILITY, TEST_ENVIRONMENT, DEFAULT_TIMEOUT if allow_smtputf8 is None: allow_smtputf8 = ALLOW_SMTPUTF8 @@ -34,6 +35,8 @@ def validate_email( allow_quoted_local = ALLOW_QUOTED_LOCAL if allow_domain_literal is None: allow_domain_literal = ALLOW_DOMAIN_LITERAL + if allow_display_name is None: + allow_display_name = ALLOW_DISPLAY_NAME if check_deliverability is None: check_deliverability = CHECK_DELIVERABILITY if test_environment is None: @@ -52,17 +55,20 @@ def validate_email( except ValueError as e: raise EmailSyntaxError("The email address is not valid ASCII.") from e - # Split the address into the local part (before the @-sign) - # and the domain part (after the @-sign). Normally, there - # is only one @-sign. But the awkward "quoted string" local - # part form (RFC 5321 4.1.2) allows @-signs in the local + # Split the address into the display name (or None), the local part + # (before the @-sign), and the domain part (after the @-sign). + # Normally, there is only one @-sign. But the awkward "quoted string" + # local part form (RFC 5321 4.1.2) allows @-signs in the local # part if the local part is quoted. - local_part, domain_part, is_quoted_local_part \ + display_name, local_part, domain_part, is_quoted_local_part \ = split_email(email) + if display_name is not None and not allow_display_name: + raise EmailSyntaxError("A display name and angle brackets around the email address are not permitted here.") # Collect return values in this instance. ret = ValidatedEmail() ret.original = email + ret.display_name = display_name # Validate the email address's local part syntax and get a normalized form. 
# If the original address was quoted and the decoded local part is a valid diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 693d7da..65e3ec0 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -92,6 +92,45 @@ ascii_email='de-quoted.local.part@example.org' ), ), + ( + 'MyName ', + ValidatedEmail( + local_part='me', + ascii_local_part='me', + smtputf8=False, + ascii_domain='example.org', + domain='example.org', + normalized='me@example.org', + ascii_email='me@example.org', + display_name="MyName" + ), + ), + ( + 'My Name ', + ValidatedEmail( + local_part='me', + ascii_local_part='me', + smtputf8=False, + ascii_domain='example.org', + domain='example.org', + normalized='me@example.org', + ascii_email='me@example.org', + display_name="My Name" + ), + ), + ( + r'"My.\"Na\\me\".Is" <"me \" \\ me"@example.org>', + ValidatedEmail( + local_part=r'"me \" \\ me"', + ascii_local_part=r'"me \" \\ me"', + smtputf8=False, + ascii_domain='example.org', + domain='example.org', + normalized=r'"me \" \\ me"@example.org', + ascii_email=r'"me \" \\ me"@example.org', + display_name='My."Na\\me".Is' + ), + ), ], ) def test_email_valid(email_input, output): @@ -99,10 +138,11 @@ def test_email_valid(email_input, output): # for addresses that are valid but require SMTPUTF8. Check that it passes with # allow_smtput8 both on and off. emailinfo = validate_email(email_input, check_deliverability=False, allow_smtputf8=False, - allow_quoted_local=True) + allow_quoted_local=True, allow_display_name=True) + assert emailinfo == output assert validate_email(email_input, check_deliverability=False, allow_smtputf8=True, - allow_quoted_local=True) == output + allow_quoted_local=True, allow_display_name=True) == output # Check that the old `email` attribute to access the normalized form still works # if the DeprecationWarning is suppressed. 
@@ -363,6 +403,12 @@ def test_domain_literal(): ('me@[tag:text]', 'The part after the @-sign contains an invalid address literal tag in brackets.'), ('me@[untaggedtext]', 'The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.'), ('me@[tag:invalid space]', 'The part after the @-sign contains invalid characters in brackets: SPACE.'), + ('', 'A display name and angle brackets around the email address are not permitted here.'), + ('DisplayName ', 'A display name and angle brackets around the email address are not permitted here.'), + ('Display Name ', 'A display name and angle brackets around the email address are not permitted here.'), + ('\"Display Name\" ', 'A display name and angle brackets around the email address are not permitted here.'), + ('Display.Name ', 'The display name contains invalid characters when not quoted: \'.\'.'), + ('\"Display.Name\" ', 'A display name and angle brackets around the email address are not permitted here.'), ], ) def test_email_invalid_syntax(email_input, error_msg): From 8d91a4519c5a92b64dda06487acafb33d02494ac Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Thu, 9 May 2024 05:22:52 -0400 Subject: [PATCH 141/174] Ratchet up mypy settings --- email_validator/__main__.py | 3 ++- email_validator/deliverability.py | 10 +++++----- email_validator/exceptions_types.py | 4 ++-- email_validator/syntax.py | 12 ++++++++++-- email_validator/validate_email.py | 10 ++++++++-- pyproject.toml | 13 +++++++++++++ tests/mocked_dns_response.py | 6 ++++-- 7 files changed, 44 insertions(+), 14 deletions(-) diff --git a/email_validator/__main__.py b/email_validator/__main__.py index a414ff6..1834894 100644 --- a/email_validator/__main__.py +++ b/email_validator/__main__.py @@ -17,6 +17,7 @@ import json import os import sys +from typing import Any, Dict from .validate_email import validate_email from .deliverability import caching_resolver @@ -27,7 +28,7 @@ def main(dns_resolver=None): # The dns_resolver 
argument is for tests. # Set options from environment variables. - options = {} + options: Dict[str, Any] = {} for varname in ('ALLOW_SMTPUTF8', 'ALLOW_QUOTED_LOCAL', 'ALLOW_DOMAIN_LITERAL', 'GLOBALLY_DELIVERABLE', 'CHECK_DELIVERABILITY', 'TEST_ENVIRONMENT'): if varname in os.environ: diff --git a/email_validator/deliverability.py b/email_validator/deliverability.py index e2e5076..ccefc8a 100644 --- a/email_validator/deliverability.py +++ b/email_validator/deliverability.py @@ -1,4 +1,4 @@ -from typing import Optional, Any, Dict +from typing import Any, Dict, Optional import ipaddress @@ -8,17 +8,17 @@ import dns.exception -def caching_resolver(*, timeout: Optional[int] = None, cache=None, dns_resolver=None): +def caching_resolver(*, timeout: Optional[int] = None, cache: Any = None, dns_resolver: Optional[dns.resolver.Resolver] = None) -> dns.resolver.Resolver: if timeout is None: from . import DEFAULT_TIMEOUT timeout = DEFAULT_TIMEOUT resolver = dns_resolver or dns.resolver.Resolver() - resolver.cache = cache or dns.resolver.LRUCache() # type: ignore - resolver.lifetime = timeout # type: ignore # timeout, in seconds + resolver.cache = cache or dns.resolver.LRUCache() + resolver.lifetime = timeout # timeout, in seconds return resolver -def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Optional[int] = None, dns_resolver=None): +def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Optional[int] = None, dns_resolver: Optional[dns.resolver.Resolver] = None) -> Dict[str, str]: # Check that the domain resolves to an MX record. If there is no MX record, # try an A or AAAA record which is a deprecated fallback for deliverability. # Raises an EmailUndeliverableError on failure. 
On success, returns a dict diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py index 7483b0b..452cff3 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/exceptions_types.py @@ -1,5 +1,5 @@ import warnings -from typing import Optional +from typing import List, Optional, Tuple class EmailNotValidError(ValueError): @@ -56,7 +56,7 @@ class ValidatedEmail: """If a deliverability check is performed and if it succeeds, a list of (priority, domain) tuples of MX records specified in the DNS for the domain.""" - mx: list + mx: List[Tuple[int, str]] """If no MX records are actually specified in DNS and instead are inferred, through an obsolete mechanism, from A or AAAA records, the value is the type of DNS record used instead (`A` or `AAAA`).""" diff --git a/email_validator/syntax.py b/email_validator/syntax.py index b8df0e6..5e52100 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -7,7 +7,7 @@ import unicodedata import idna # implements IDNA 2008; Python's codec is only IDNA 2003 import ipaddress -from typing import Optional +from typing import Optional, TypedDict, Union def split_email(email): @@ -180,8 +180,14 @@ def safe_character_display(c): return unicodedata.name(c, h) +class LocalPartValidationResult(TypedDict): + local_part: str + ascii_local_part: Optional[str] + smtputf8: bool + + def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_empty_local: bool = False, - quoted_local_part: bool = False): + quoted_local_part: bool = False) -> LocalPartValidationResult: """Validates the syntax of the local part of an email address.""" if len(local) == 0: @@ -626,6 +632,8 @@ def validate_email_domain_literal(domain_literal): # a compressed/normalized address. # RFC 5321 4.1.3 and RFC 5322 3.4.1. + addr: Union[ipaddress.IPv4Address, ipaddress.IPv6Address] + # Try to parse the domain literal as an IPv4 address. 
# There is no tag for IPv4 addresses, so we can never # be sure if the user intends an IPv4 address. diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index f73a479..3d851ee 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -1,9 +1,15 @@ -from typing import Optional, Union +from typing import Optional, Union, TYPE_CHECKING from .exceptions_types import EmailSyntaxError, ValidatedEmail from .syntax import split_email, validate_email_local_part, validate_email_domain_name, validate_email_domain_literal, validate_email_length from .rfc_constants import CASE_INSENSITIVE_MAILBOX_NAMES +if TYPE_CHECKING: + import dns.resolver + _Resolver = dns.resolver.Resolver +else: + _Resolver = object + def validate_email( email: Union[str, bytes], @@ -18,7 +24,7 @@ def validate_email( test_environment: Optional[bool] = None, globally_deliverable: Optional[bool] = None, timeout: Optional[int] = None, - dns_resolver: Optional[object] = None + dns_resolver: Optional[_Resolver] = None ) -> ValidatedEmail: """ Given an email address, and some options, returns a ValidatedEmail instance diff --git a/pyproject.toml b/pyproject.toml index 1379d17..5d3a28f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,16 @@ +[tool.mypy] +disallow_any_generics = true +disallow_subclassing_any = true + +check_untyped_defs = true +disallow_incomplete_defs = true +# disallow_untyped_calls = true +disallow_untyped_decorators = true +# disallow_untyped_defs = true + +warn_redundant_casts = true +warn_unused_ignores = true + [tool.pytest.ini_options] markers = [ "network: marks tests as requiring Internet access", diff --git a/tests/mocked_dns_response.py b/tests/mocked_dns_response.py index 1c7d157..fc1f1a6 100644 --- a/tests/mocked_dns_response.py +++ b/tests/mocked_dns_response.py @@ -20,9 +20,11 @@ class MockedDnsResponseData: DATA_PATH = os.path.dirname(__file__) + "/mocked-dns-answers.json" + INSTANCE = None + @staticmethod 
def create_resolver(): - if not hasattr(MockedDnsResponseData, 'INSTANCE'): + if MockedDnsResponseData.INSTANCE is None: # Create a singleton instance of this class and load the saved DNS responses. # Except when BUILD_MOCKED_DNS_RESPONSE_DATA is true, don't load the data. singleton = MockedDnsResponseData() @@ -116,6 +118,6 @@ def put(self, key, value): @pytest.fixture(scope="session", autouse=True) def MockedDnsResponseDataCleanup(request): def cleanup_func(): - if BUILD_MOCKED_DNS_RESPONSE_DATA: + if BUILD_MOCKED_DNS_RESPONSE_DATA and MockedDnsResponseData.INSTANCE is not None: MockedDnsResponseData.INSTANCE.save() request.addfinalizer(cleanup_func) From 68019d7ad7198d90293d61cd8c78487509cdd617 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Thu, 9 May 2024 05:29:30 -0400 Subject: [PATCH 142/174] Fix typo --- email_validator/exceptions_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py index 452cff3..4a2e8fd 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/exceptions_types.py @@ -24,7 +24,7 @@ class ValidatedEmail: """The email address that was passed to validate_email. (If passed as bytes, this will be a string.)""" original: str - """The normalized email address, which should always be used in preferance to the original address. + """The normalized email address, which should always be used in preference to the original address. The normalized address converts an IDNA ASCII domain name to Unicode, if possible, and performs Unicode normalization on the local part and on the domain (if originally Unicode). 
It is the concatenation of the local_part and domain attributes, separated by an @-sign.""" From 5734e5e9c49b68f9bf7a26206ac6f70df7d66956 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Thu, 9 May 2024 06:11:27 -0400 Subject: [PATCH 143/174] mypy: disallow_untyped_defs --- email_validator/__init__.py | 12 ++++++--- email_validator/__main__.py | 6 ++--- email_validator/deliverability.py | 4 +-- email_validator/exceptions_types.py | 20 +++++++-------- email_validator/syntax.py | 34 ++++++++++++++++--------- email_validator/validate_email.py | 14 +++++------ pyproject.toml | 2 +- tests/mocked_dns_response.py | 39 ++++++++++++++++------------- tests/test_deliverability.py | 20 ++++++++------- tests/test_main.py | 12 ++++----- tests/test_syntax.py | 24 +++++++++--------- 11 files changed, 103 insertions(+), 84 deletions(-) diff --git a/email_validator/__init__.py b/email_validator/__init__.py index 3f10088..626aa00 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING + # Export the main method, helper methods, and the public data types. 
from .exceptions_types import ValidatedEmail, EmailNotValidError, \ EmailSyntaxError, EmailUndeliverableError @@ -9,12 +11,14 @@ "EmailSyntaxError", "EmailUndeliverableError", "caching_resolver", "__version__"] - -def caching_resolver(*args, **kwargs): - # Lazy load `deliverability` as it is slow to import (due to dns.resolver) +if TYPE_CHECKING: from .deliverability import caching_resolver +else: + def caching_resolver(*args, **kwargs): + # Lazy load `deliverability` as it is slow to import (due to dns.resolver) + from .deliverability import caching_resolver - return caching_resolver(*args, **kwargs) + return caching_resolver(*args, **kwargs) # These global attributes are a part of the library's API and can be diff --git a/email_validator/__main__.py b/email_validator/__main__.py index 1834894..52791c7 100644 --- a/email_validator/__main__.py +++ b/email_validator/__main__.py @@ -17,14 +17,14 @@ import json import os import sys -from typing import Any, Dict +from typing import Any, Dict, Optional -from .validate_email import validate_email +from .validate_email import validate_email, _Resolver from .deliverability import caching_resolver from .exceptions_types import EmailNotValidError -def main(dns_resolver=None): +def main(dns_resolver: Optional[_Resolver] = None) -> None: # The dns_resolver argument is for tests. # Set options from environment variables. diff --git a/email_validator/deliverability.py b/email_validator/deliverability.py index ccefc8a..6800557 100644 --- a/email_validator/deliverability.py +++ b/email_validator/deliverability.py @@ -69,9 +69,9 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option # https://www.iana.org/assignments/iana-ipv4-special-registry/iana-ipv4-special-registry.xhtml # https://www.iana.org/assignments/iana-ipv6-special-registry/iana-ipv6-special-registry.xhtml # (Issue #134.) 
- def is_global_addr(ipaddr): + def is_global_addr(address: Any) -> bool: try: - ipaddr = ipaddress.ip_address(ipaddr) + ipaddr = ipaddress.ip_address(address) except ValueError: return False return ipaddr.is_global diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py index 4a2e8fd..e37bb9f 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/exceptions_types.py @@ -1,5 +1,5 @@ import warnings -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union class EmailNotValidError(ValueError): @@ -63,18 +63,18 @@ class ValidatedEmail: mx_fallback_type: str """The display name in the original input text, unquoted and unescaped, or None.""" - display_name: str + display_name: Optional[str] """Tests use this constructor.""" - def __init__(self, **kwargs): + def __init__(self, **kwargs: Any) -> None: for k, v in kwargs.items(): setattr(self, k, v) - def __repr__(self): + def __repr__(self) -> str: return f"" """For backwards compatibility, support old field names.""" - def __getattr__(self, key): + def __getattr__(self, key: str) -> str: if key == "original_email": return self.original if key == "email": @@ -82,13 +82,13 @@ def __getattr__(self, key): raise AttributeError(key) @property - def email(self): + def email(self) -> str: warnings.warn("ValidatedEmail.email is deprecated and will be removed, use ValidatedEmail.normalized instead", DeprecationWarning) return self.normalized """For backwards compatibility, some fields are also exposed through a dict-like interface. 
Note that some of the names changed when they became attributes.""" - def __getitem__(self, key): + def __getitem__(self, key: str) -> Union[Optional[str], bool, List[Tuple[int, str]]]: warnings.warn("dict-like access to the return value of validate_email is deprecated and may not be supported in the future.", DeprecationWarning, stacklevel=2) if key == "email": return self.normalized @@ -109,7 +109,7 @@ def __getitem__(self, key): raise KeyError() """Tests use this.""" - def __eq__(self, other): + def __eq__(self, other: object) -> bool: if not isinstance(other, ValidatedEmail): return False return ( @@ -127,7 +127,7 @@ def __eq__(self, other): ) """This helps producing the README.""" - def as_constructor(self): + def as_constructor(self) -> str: return "ValidatedEmail(" \ + ",".join(f"\n {key}={repr(getattr(self, key))}" for key in ('normalized', 'local_part', 'domain', @@ -139,7 +139,7 @@ def as_constructor(self): + ")" """Convenience method for accessing ValidatedEmail as a dict""" - def as_dict(self): + def as_dict(self) -> Dict[str, Any]: d = self.__dict__ if d.get('domain_address'): d['domain_address'] = repr(d['domain_address']) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 5e52100..efbcd73 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -1,4 +1,4 @@ -from .exceptions_types import EmailSyntaxError +from .exceptions_types import EmailSyntaxError, ValidatedEmail from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \ DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_DOT_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \ DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS @@ -7,10 +7,10 @@ import unicodedata import idna # implements IDNA 2008; Python's codec is only IDNA 2003 import ipaddress -from typing import Optional, TypedDict, Union +from typing import Optional, Tuple, TypedDict, Union -def split_email(email): +def split_email(email: str) 
-> Tuple[Optional[str], str, str, bool]: # Return the display name, unescaped local part, and domain part # of the address, and whether the local part was quoted. If no # display name was present and angle brackets do not surround @@ -46,7 +46,7 @@ def split_email(email): # We assume the input string is already stripped of leading and # trailing CFWS. - def split_string_at_unquoted_special(text, specials): + def split_string_at_unquoted_special(text: str, specials: Tuple[str, ...]) -> Tuple[str, str]: # Split the string at the first character in specials (an @-sign # or left angle bracket) that does not occur within quotes. inside_quote = False @@ -77,7 +77,7 @@ def split_string_at_unquoted_special(text, specials): return left_part, right_part - def unquote_quoted_string(text): + def unquote_quoted_string(text: str) -> Tuple[str, bool]: # Remove surrounding quotes and unescape escaped backslashes # and quotes. Escapes are parsed liberally. I think only # backslashes and quotes can be escaped but we'll allow anything @@ -155,7 +155,7 @@ def unquote_quoted_string(text): return display_name, local_part, domain_part, is_quoted_local_part -def get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH): +def get_length_reason(addr: str, utf8: bool = False, limit: int = EMAIL_MAX_LENGTH) -> str: """Helper function to return an error message related to invalid length.""" diff = len(addr) - limit prefix = "at least " if utf8 else "" @@ -163,7 +163,7 @@ def get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH): return f"({prefix}{diff} character{suffix} too many)" -def safe_character_display(c): +def safe_character_display(c: str) -> str: # Return safely displayable characters in quotes. 
if c == '\\': return f"\"{c}\"" # can't use repr because it escapes it @@ -351,7 +351,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp raise EmailSyntaxError("The email address contains invalid characters before the @-sign.") -def check_unsafe_chars(s, allow_space=False): +def check_unsafe_chars(s: str, allow_space: bool = False) -> None: # Check for unsafe characters or characters that would make the string # invalid or non-sensible Unicode. bad_chars = set() @@ -403,7 +403,7 @@ def check_unsafe_chars(s, allow_space=False): + ", ".join(safe_character_display(c) for c in sorted(bad_chars)) + ".") -def check_dot_atom(label, start_descr, end_descr, is_hostname): +def check_dot_atom(label: str, start_descr: str, end_descr: str, is_hostname: bool) -> None: # RFC 5322 3.2.3 if label.endswith("."): raise EmailSyntaxError(end_descr.format("period")) @@ -422,7 +422,12 @@ def check_dot_atom(label, start_descr, end_descr, is_hostname): raise EmailSyntaxError("An email address cannot have a period and a hyphen next to each other.") -def validate_email_domain_name(domain, test_environment=False, globally_deliverable=True): +class DomainNameValidationResult(TypedDict): + ascii_domain: str + domain: str + + +def validate_email_domain_name(domain: str, test_environment: bool = False, globally_deliverable: bool = True) -> DomainNameValidationResult: """Validates the syntax of the domain part of an email address.""" # Check for invalid characters before normalization. 
@@ -586,7 +591,7 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera } -def validate_email_length(addrinfo): +def validate_email_length(addrinfo: ValidatedEmail) -> None: # If the email address has an ASCII representation, then we assume it may be # transmitted in ASCII (we can't assume SMTPUTF8 will be used on all hops to # the destination) and the length limit applies to ASCII characters (which is @@ -627,7 +632,12 @@ def validate_email_length(addrinfo): raise EmailSyntaxError(f"The email address is too long {reason}.") -def validate_email_domain_literal(domain_literal): +class DomainLiteralValidationResult(TypedDict): + domain_address: Union[ipaddress.IPv4Address, ipaddress.IPv6Address] + domain: str + + +def validate_email_domain_literal(domain_literal: str) -> DomainLiteralValidationResult: # This is obscure domain-literal syntax. Parse it and return # a compressed/normalized address. # RFC 5321 4.1.3 and RFC 5322 3.4.1. diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index 3d851ee..0abcfd5 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -110,20 +110,20 @@ def validate_email( elif domain_part.startswith("[") and domain_part.endswith("]"): # Parse the address in the domain literal and get back a normalized domain. - domain_part_info = validate_email_domain_literal(domain_part[1:-1]) + domain_literal_info = validate_email_domain_literal(domain_part[1:-1]) if not allow_domain_literal: raise EmailSyntaxError("A bracketed IP address after the @-sign is not allowed here.") - ret.domain = domain_part_info["domain"] - ret.ascii_domain = domain_part_info["domain"] # Domain literals are always ASCII. - ret.domain_address = domain_part_info["domain_address"] + ret.domain = domain_literal_info["domain"] + ret.ascii_domain = domain_literal_info["domain"] # Domain literals are always ASCII. 
+ ret.domain_address = domain_literal_info["domain_address"] is_domain_literal = True # Prevent deliverability checks. else: # Check the syntax of the domain and get back a normalized # internationalized and ASCII form. - domain_part_info = validate_email_domain_name(domain_part, test_environment=test_environment, globally_deliverable=globally_deliverable) - ret.domain = domain_part_info["domain"] - ret.ascii_domain = domain_part_info["ascii_domain"] + domain_name_info = validate_email_domain_name(domain_part, test_environment=test_environment, globally_deliverable=globally_deliverable) + ret.domain = domain_name_info["domain"] + ret.ascii_domain = domain_name_info["ascii_domain"] # Construct the complete normalized form. ret.normalized = ret.local_part + "@" + ret.domain diff --git a/pyproject.toml b/pyproject.toml index 5d3a28f..9515ace 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ check_untyped_defs = true disallow_incomplete_defs = true # disallow_untyped_calls = true disallow_untyped_decorators = true -# disallow_untyped_defs = true +disallow_untyped_defs = true warn_redundant_casts = true warn_unused_ignores = true diff --git a/tests/mocked_dns_response.py b/tests/mocked_dns_response.py index fc1f1a6..ddd4c94 100644 --- a/tests/mocked_dns_response.py +++ b/tests/mocked_dns_response.py @@ -1,3 +1,6 @@ +from typing import Any, Dict, Iterator, Optional + +import dns.rdataset import dns.resolver import json import os.path @@ -23,7 +26,7 @@ class MockedDnsResponseData: INSTANCE = None @staticmethod - def create_resolver(): + def create_resolver() -> dns.resolver.Resolver: if MockedDnsResponseData.INSTANCE is None: # Create a singleton instance of this class and load the saved DNS responses. # Except when BUILD_MOCKED_DNS_RESPONSE_DATA is true, don't load the data. 
@@ -37,20 +40,19 @@ def create_resolver(): dns_resolver = dns.resolver.Resolver(configure=BUILD_MOCKED_DNS_RESPONSE_DATA) return caching_resolver(cache=MockedDnsResponseData.INSTANCE, dns_resolver=dns_resolver) - def __init__(self): - self.data = {} - - def load(self): - # Loads the saved DNS response data from the JSON file and - # re-structures it into dnspython classes. - class Ans: # mocks the dns.resolver.Answer class + def __init__(self) -> None: + self.data: Dict[dns.resolver.CacheKey, Optional[MockedDnsResponseData.Ans]] = {} - def __init__(self, rrset): - self.rrset = rrset + # Loads the saved DNS response data from the JSON file and + # re-structures it into dnspython classes. + class Ans: # mocks the dns.resolver.Answer class + def __init__(self, rrset: dns.rdataset.Rdataset) -> None: + self.rrset = rrset - def __iter__(self): - return iter(self.rrset) + def __iter__(self) -> Iterator[Any]: + return iter(self.rrset) + def load(self) -> None: with open(self.DATA_PATH) as f: data = json.load(f) for item in data: @@ -62,11 +64,11 @@ def __iter__(self): for rr in item["answer"] ] if item["answer"]: - self.data[key] = Ans(dns.rdataset.from_rdata_list(0, rdatas=rdatas)) + self.data[key] = MockedDnsResponseData.Ans(dns.rdataset.from_rdata_list(0, rdatas=rdatas)) else: self.data[key] = None - def save(self): + def save(self) -> None: # Re-structure as a list with basic data types. data = [ { @@ -81,11 +83,12 @@ def save(self): ]) } for key, value in self.data.items() + if value is not None ] with open(self.DATA_PATH, "w") as f: json.dump(data, f, indent=True) - def get(self, key): + def get(self, key: dns.resolver.CacheKey) -> Optional[Ans]: # Special-case a domain to create a timeout. 
if key[0].to_text() == "timeout.com.": raise dns.exception.Timeout() @@ -108,7 +111,7 @@ def get(self, key): raise ValueError(f"Saved DNS data did not contain query: {key}") - def put(self, key, value): + def put(self, key: dns.resolver.CacheKey, value: Ans) -> None: # Build the DNS data by saving the live query response. if not BUILD_MOCKED_DNS_RESPONSE_DATA: raise ValueError("Should not get here.") @@ -116,8 +119,8 @@ def put(self, key, value): @pytest.fixture(scope="session", autouse=True) -def MockedDnsResponseDataCleanup(request): - def cleanup_func(): +def MockedDnsResponseDataCleanup(request: pytest.FixtureRequest) -> None: + def cleanup_func() -> None: if BUILD_MOCKED_DNS_RESPONSE_DATA and MockedDnsResponseData.INSTANCE is not None: MockedDnsResponseData.INSTANCE.save() request.addfinalizer(cleanup_func) diff --git a/tests/test_deliverability.py b/tests/test_deliverability.py index 0ed5c3f..b65116b 100644 --- a/tests/test_deliverability.py +++ b/tests/test_deliverability.py @@ -1,3 +1,5 @@ +from typing import Any, Dict + import pytest import re @@ -17,7 +19,7 @@ ('pages.github.com', {'mx': [(0, 'pages.github.com')], 'mx_fallback_type': 'A'}), ], ) -def test_deliverability_found(domain, expected_response): +def test_deliverability_found(domain: str, expected_response: str) -> None: response = validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) assert response == expected_response @@ -35,7 +37,7 @@ def test_deliverability_found(domain, expected_response): ('justtxt.joshdata.me', 'The domain name {domain} does not accept email'), ], ) -def test_deliverability_fails(domain, error): +def test_deliverability_fails(domain: str, error: str) -> None: with pytest.raises(EmailUndeliverableError, match=error.format(domain=domain)): validate_email_deliverability(domain, domain, dns_resolver=RESOLVER) @@ -48,7 +50,7 @@ def test_deliverability_fails(domain, error): ('me@mail.example.com'), ], ) -def test_email_example_reserved_domain(email_input): +def 
test_email_example_reserved_domain(email_input: str) -> None: # Since these all fail deliverabiltiy from a static list, # DNS deliverability checks do not arise. with pytest.raises(EmailUndeliverableError) as exc_info: @@ -57,22 +59,22 @@ def test_email_example_reserved_domain(email_input): assert re.match(r"The domain name [a-z\.]+ does not (accept email|exist)\.", str(exc_info.value)) is not None -def test_deliverability_dns_timeout(): +def test_deliverability_dns_timeout() -> None: response = validate_email_deliverability('timeout.com', 'timeout.com', dns_resolver=RESOLVER) assert "mx" not in response assert response.get("unknown-deliverability") == "timeout" @pytest.mark.network -def test_caching_dns_resolver(): +def test_caching_dns_resolver() -> None: class TestCache: - def __init__(self): - self.cache = {} + def __init__(self) -> None: + self.cache: Dict[Any, Any] = {} - def get(self, key): + def get(self, key: Any) -> Any: return self.cache.get(key) - def put(self, key, value): + def put(self, key: Any, value: Any) -> Any: self.cache[key] = value cache = TestCache() diff --git a/tests/test_main.py b/tests/test_main.py index 579163f..ab8eecd 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -9,14 +9,14 @@ RESOLVER = MockedDnsResponseData.create_resolver() -def test_dict_accessor(): +def test_dict_accessor() -> None: input_email = "testaddr@example.tld" valid_email = validate_email(input_email, check_deliverability=False) assert isinstance(valid_email.as_dict(), dict) assert valid_email.as_dict()["original"] == input_email -def test_main_single_good_input(monkeypatch, capsys): +def test_main_single_good_input(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: import json test_email = "google@google.com" monkeypatch.setattr('sys.argv', ['email_validator', test_email]) @@ -27,7 +27,7 @@ def test_main_single_good_input(monkeypatch, capsys): assert validate_email(test_email, dns_resolver=RESOLVER).original == 
output["original"] -def test_main_single_bad_input(monkeypatch, capsys): +def test_main_single_bad_input(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: bad_email = 'test@..com' monkeypatch.setattr('sys.argv', ['email_validator', bad_email]) validator_command_line_tool(dns_resolver=RESOLVER) @@ -35,7 +35,7 @@ def test_main_single_bad_input(monkeypatch, capsys): assert stdout == 'An email address cannot have a period immediately after the @-sign.\n' -def test_main_multi_input(monkeypatch, capsys): +def test_main_multi_input(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: import io test_cases = ["google1@google.com", "google2@google.com", "test@.com", "test3@.com"] test_input = io.StringIO("\n".join(test_cases)) @@ -49,7 +49,7 @@ def test_main_multi_input(monkeypatch, capsys): assert test_cases[3] in stdout -def test_bytes_input(): +def test_bytes_input() -> None: input_email = b"testaddr@example.tld" valid_email = validate_email(input_email, check_deliverability=False) assert isinstance(valid_email.as_dict(), dict) @@ -60,7 +60,7 @@ def test_bytes_input(): validate_email(input_email, check_deliverability=False) -def test_deprecation(): +def test_deprecation() -> None: input_email = b"testaddr@example.tld" valid_email = validate_email(input_email, check_deliverability=False) with pytest.deprecated_call(): diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 65e3ec0..08551f5 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -133,7 +133,7 @@ ), ], ) -def test_email_valid(email_input, output): +def test_email_valid(email_input: str, output: ValidatedEmail) -> None: # These addresses do not require SMTPUTF8. See test_email_valid_intl_local_part # for addresses that are valid but require SMTPUTF8. Check that it passes with # allow_smtput8 both on and off. 
@@ -287,7 +287,7 @@ def test_email_valid(email_input, output): ), ], ) -def test_email_valid_intl_local_part(email_input, output): +def test_email_valid_intl_local_part(email_input: str, output: ValidatedEmail) -> None: # Check that it passes when allow_smtputf8 is True. assert validate_email(email_input, check_deliverability=False) == output @@ -309,7 +309,7 @@ def test_email_valid_intl_local_part(email_input, output): ('"quoted.with..unicode.λ"@example.com', '"quoted.with..unicode.λ"'), ('"quoted.with.extraneous.\\escape"@example.com', 'quoted.with.extraneous.escape'), ]) -def test_email_valid_only_if_quoted_local_part(email_input, normalized_local_part): +def test_email_valid_only_if_quoted_local_part(email_input: str, normalized_local_part: str) -> None: # These addresses are invalid with the default allow_quoted_local=False option. with pytest.raises(EmailSyntaxError) as exc_info: validate_email(email_input) @@ -323,7 +323,7 @@ def test_email_valid_only_if_quoted_local_part(email_input, normalized_local_par assert validated.local_part == normalized_local_part -def test_domain_literal(): +def test_domain_literal() -> None: # Check parsing IPv4 addresses. validated = validate_email("me@[127.0.0.1]", allow_domain_literal=True) assert validated.domain == "[127.0.0.1]" @@ -411,7 +411,7 @@ def test_domain_literal(): ('\"Display.Name\" ', 'A display name and angle brackets around the email address are not permitted here.'), ], ) -def test_email_invalid_syntax(email_input, error_msg): +def test_email_invalid_syntax(email_input: str, error_msg: str) -> None: # Since these all have syntax errors, deliverability # checks do not arise. 
with pytest.raises(EmailSyntaxError) as exc_info: @@ -430,7 +430,7 @@ def test_email_invalid_syntax(email_input, error_msg): ('me@test.test.test'), ], ) -def test_email_invalid_reserved_domain(email_input): +def test_email_invalid_reserved_domain(email_input: str) -> None: # Since these all fail deliverabiltiy from a static list, # DNS deliverability checks do not arise. with pytest.raises(EmailSyntaxError) as exc_info: @@ -454,7 +454,7 @@ def test_email_invalid_reserved_domain(email_input): ('\uFDEF', 'U+FDEF'), # unassigned (Cn) ], ) -def test_email_unsafe_character(s, expected_error): +def test_email_unsafe_character(s: str, expected_error: str) -> None: # Check for various unsafe characters that are permitted by the email # specs but should be disallowed for being unsafe or not sensible Unicode. @@ -474,26 +474,26 @@ def test_email_unsafe_character(s, expected_error): ('"quoted.with..unicode.λ"@example.com', 'Internationalized characters before the @-sign are not supported: \'λ\'.'), ], ) -def test_email_invalid_character_smtputf8_off(email_input, expected_error): +def test_email_invalid_character_smtputf8_off(email_input: str, expected_error: str) -> None: # Check that internationalized characters are rejected if allow_smtputf8=False. with pytest.raises(EmailSyntaxError) as exc_info: validate_email(email_input, allow_smtputf8=False, test_environment=True) assert str(exc_info.value) == expected_error -def test_email_empty_local(): +def test_email_empty_local() -> None: validate_email("@test", allow_empty_local=True, test_environment=True) # This next one might not be desirable. 
validate_email("\"\"@test", allow_empty_local=True, allow_quoted_local=True, test_environment=True) -def test_email_test_domain_name_in_test_environment(): +def test_email_test_domain_name_in_test_environment() -> None: validate_email("anything@test", test_environment=True) validate_email("anything@mycompany.test", test_environment=True) -def test_case_insensitive_mailbox_name(): +def test_case_insensitive_mailbox_name() -> None: validate_email("POSTMASTER@test", test_environment=True).normalized = "postmaster@test" validate_email("NOT-POSTMASTER@test", test_environment=True).normalized = "NOT-POSTMASTER@test" @@ -673,7 +673,7 @@ def test_case_insensitive_mailbox_name(): ['test.(comment)test@iana.org', 'ISEMAIL_DEPREC_COMMENT'] ] ) -def test_pyisemail_tests(email_input, status): +def test_pyisemail_tests(email_input: str, status: str) -> None: if status == "ISEMAIL_VALID": # All standard email address forms should not raise an exception # with any set of parsing options. From 9da50717822175585a34a4ea199eddff3b738155 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Thu, 9 May 2024 06:14:06 -0400 Subject: [PATCH 144/174] mypy: disallow_untyped_calls --- pyproject.toml | 2 +- tests/mocked_dns_response.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9515ace..a92c08e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ disallow_subclassing_any = true check_untyped_defs = true disallow_incomplete_defs = true -# disallow_untyped_calls = true +disallow_untyped_calls = true disallow_untyped_decorators = true disallow_untyped_defs = true diff --git a/tests/mocked_dns_response.py b/tests/mocked_dns_response.py index ddd4c94..c6db5cb 100644 --- a/tests/mocked_dns_response.py +++ b/tests/mocked_dns_response.py @@ -1,5 +1,6 @@ from typing import Any, Dict, Iterator, Optional +import dns.exception import dns.rdataset import dns.resolver import json @@ -91,7 +92,7 @@ def save(self) -> None: def 
get(self, key: dns.resolver.CacheKey) -> Optional[Ans]: # Special-case a domain to create a timeout. if key[0].to_text() == "timeout.com.": - raise dns.exception.Timeout() + raise dns.exception.Timeout() # type: ignore [no-untyped-call] # When building the DNS response database, return # a cache miss. @@ -101,13 +102,13 @@ def get(self, key: dns.resolver.CacheKey) -> Optional[Ans]: # Query the data for a matching record. if key in self.data: if not self.data[key]: - raise dns.resolver.NoAnswer() + raise dns.resolver.NoAnswer() # type: ignore [no-untyped-call] return self.data[key] # Query the data for a response to an ANY query. ANY = dns.rdatatype.from_text("ANY") if (key[0], ANY, key[2]) in self.data and self.data[(key[0], ANY, key[2])] is None: - raise dns.resolver.NXDOMAIN() + raise dns.resolver.NXDOMAIN() # type: ignore [no-untyped-call] raise ValueError(f"Saved DNS data did not contain query: {key}") From be42a70480b23025f169fc2b9dfaceeb66502faa Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Thu, 9 May 2024 06:22:37 -0400 Subject: [PATCH 145/174] Run test_and_build on PR --- .github/workflows/test_and_build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_and_build.yaml b/.github/workflows/test_and_build.yaml index 5268a2b..6cc4a07 100644 --- a/.github/workflows/test_and_build.yaml +++ b/.github/workflows/test_and_build.yaml @@ -1,6 +1,6 @@ name: Tests -on: [push] +on: [push, pull_request] jobs: build: From 380e44eaf8e2d48691f9c14358e49c6158db8973 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Thu, 9 May 2024 06:50:50 -0400 Subject: [PATCH 146/174] Move setattr out of non-test code --- email_validator/deliverability.py | 14 +++++--- email_validator/exceptions_types.py | 7 +--- email_validator/validate_email.py | 6 ++-- tests/test_syntax.py | 55 +++++++++++++++++------------ 4 files changed, 47 insertions(+), 35 deletions(-) diff --git a/email_validator/deliverability.py 
b/email_validator/deliverability.py index 6800557..90f5f9a 100644 --- a/email_validator/deliverability.py +++ b/email_validator/deliverability.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional +from typing import Any, List, Optional, Tuple, TypedDict import ipaddress @@ -18,7 +18,14 @@ def caching_resolver(*, timeout: Optional[int] = None, cache: Any = None, dns_re return resolver -def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Optional[int] = None, dns_resolver: Optional[dns.resolver.Resolver] = None) -> Dict[str, str]: +DeliverabilityInfo = TypedDict("DeliverabilityInfo", { + "mx": List[Tuple[int, str]], + "mx_fallback_type": Optional[str], + "unknown-deliverability": str, +}, total=False) + + +def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Optional[int] = None, dns_resolver: Optional[dns.resolver.Resolver] = None) -> DeliverabilityInfo: # Check that the domain resolves to an MX record. If there is no MX record, # try an A or AAAA record which is a deprecated fallback for deliverability. # Raises an EmailUndeliverableError on failure. 
On success, returns a dict @@ -36,7 +43,7 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option elif timeout is not None: raise ValueError("It's not valid to pass both timeout and dns_resolver.") - deliverability_info: Dict[str, Any] = {} + deliverability_info: DeliverabilityInfo = {} try: try: @@ -115,7 +122,6 @@ def is_global_addr(address: Any) -> bool: for rec in response: value = b"".join(rec.strings) if value.startswith(b"v=spf1 "): - deliverability_info["spf"] = value.decode("ascii", errors='replace') if value == b"v=spf1 -all": raise EmailUndeliverableError(f"The domain name {domain_i18n} does not send email.") except dns.resolver.NoAnswer: diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py index e37bb9f..928a94f 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/exceptions_types.py @@ -60,16 +60,11 @@ class ValidatedEmail: """If no MX records are actually specified in DNS and instead are inferred, through an obsolete mechanism, from A or AAAA records, the value is the type of DNS record used instead (`A` or `AAAA`).""" - mx_fallback_type: str + mx_fallback_type: Optional[str] """The display name in the original input text, unquoted and unescaped, or None.""" display_name: Optional[str] - """Tests use this constructor.""" - def __init__(self, **kwargs: Any) -> None: - for k, v in kwargs.items(): - setattr(self, k, v) - def __repr__(self) -> str: return f"" diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index 0abcfd5..2adda2a 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -152,7 +152,9 @@ def validate_email( deliverability_info = validate_email_deliverability( ret.ascii_domain, ret.domain, timeout, dns_resolver ) - for key, value in deliverability_info.items(): - setattr(ret, key, value) + mx = deliverability_info.get("mx") + if mx is not None: + ret.mx = mx + ret.mx_fallback_type = 
deliverability_info.get("mx_fallback_type") return ret diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 08551f5..de41253 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -1,3 +1,5 @@ +from typing import Any + import pytest from email_validator import EmailSyntaxError, \ @@ -5,12 +7,19 @@ ValidatedEmail +def MakeValidatedEmail(**kwargs: Any) -> ValidatedEmail: + ret = ValidatedEmail() + for k, v in kwargs.items(): + setattr(ret, k, v) + return ret + + @pytest.mark.parametrize( 'email_input,output', [ ( 'Abc@example.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='Abc', ascii_local_part='Abc', smtputf8=False, @@ -22,7 +31,7 @@ ), ( 'Abc.123@test-example.com', - ValidatedEmail( + MakeValidatedEmail( local_part='Abc.123', ascii_local_part='Abc.123', smtputf8=False, @@ -34,7 +43,7 @@ ), ( 'user+mailbox/department=shipping@example.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='user+mailbox/department=shipping', ascii_local_part='user+mailbox/department=shipping', smtputf8=False, @@ -46,7 +55,7 @@ ), ( "!#$%&'*+-/=?^_`.{|}~@example.tld", - ValidatedEmail( + MakeValidatedEmail( local_part="!#$%&'*+-/=?^_`.{|}~", ascii_local_part="!#$%&'*+-/=?^_`.{|}~", smtputf8=False, @@ -58,7 +67,7 @@ ), ( 'jeff@臺網中心.tw', - ValidatedEmail( + MakeValidatedEmail( local_part='jeff', ascii_local_part='jeff', smtputf8=False, @@ -70,7 +79,7 @@ ), ( '"quoted local part"@example.org', - ValidatedEmail( + MakeValidatedEmail( local_part='"quoted local part"', ascii_local_part='"quoted local part"', smtputf8=False, @@ -82,7 +91,7 @@ ), ( '"de-quoted.local.part"@example.org', - ValidatedEmail( + MakeValidatedEmail( local_part='de-quoted.local.part', ascii_local_part='de-quoted.local.part', smtputf8=False, @@ -94,7 +103,7 @@ ), ( 'MyName ', - ValidatedEmail( + MakeValidatedEmail( local_part='me', ascii_local_part='me', smtputf8=False, @@ -107,7 +116,7 @@ ), ( 'My Name ', - ValidatedEmail( + MakeValidatedEmail( local_part='me', ascii_local_part='me', 
smtputf8=False, @@ -120,7 +129,7 @@ ), ( r'"My.\"Na\\me\".Is" <"me \" \\ me"@example.org>', - ValidatedEmail( + MakeValidatedEmail( local_part=r'"me \" \\ me"', ascii_local_part=r'"me \" \\ me"', smtputf8=False, @@ -157,7 +166,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: [ ( '伊昭傑@郵件.商務', - ValidatedEmail( + MakeValidatedEmail( local_part='伊昭傑', smtputf8=True, ascii_domain='xn--5nqv22n.xn--lhr59c', @@ -167,7 +176,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ( 'राम@मोहन.ईन्फो', - ValidatedEmail( + MakeValidatedEmail( local_part='राम', smtputf8=True, ascii_domain='xn--l2bl7a9d.xn--o1b8dj2ki', @@ -177,7 +186,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ( 'юзер@екзампл.ком', - ValidatedEmail( + MakeValidatedEmail( local_part='юзер', smtputf8=True, ascii_domain='xn--80ajglhfv.xn--j1aef', @@ -187,7 +196,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ( 'θσερ@εχαμπλε.ψομ', - ValidatedEmail( + MakeValidatedEmail( local_part='θσερ', smtputf8=True, ascii_domain='xn--mxahbxey0c.xn--xxaf0a', @@ -197,7 +206,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ( '葉士豪@臺網中心.tw', - ValidatedEmail( + MakeValidatedEmail( local_part='葉士豪', smtputf8=True, ascii_domain='xn--fiqq24b10vi0d.tw', @@ -207,7 +216,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ( '葉士豪@臺網中心.台灣', - ValidatedEmail( + MakeValidatedEmail( local_part='葉士豪', smtputf8=True, ascii_domain='xn--fiqq24b10vi0d.xn--kpry57d', @@ -217,7 +226,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ( 'jeff葉@臺網中心.tw', - ValidatedEmail( + MakeValidatedEmail( local_part='jeff葉', smtputf8=True, ascii_domain='xn--fiqq24b10vi0d.tw', @@ -227,7 +236,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ( 'ñoñó@example.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='ñoñó', smtputf8=True, 
ascii_domain='example.tld', @@ -237,7 +246,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ( '我買@example.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='我買', smtputf8=True, ascii_domain='example.tld', @@ -247,7 +256,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ( '甲斐黒川日本@example.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='甲斐黒川日本', smtputf8=True, ascii_domain='example.tld', @@ -257,7 +266,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ( 'чебурашкаящик-с-апельсинами.рф@example.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='чебурашкаящик-с-апельсинами.рф', smtputf8=True, ascii_domain='example.tld', @@ -267,7 +276,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ( 'उदाहरण.परीक्ष@domain.with.idn.tld', - ValidatedEmail( + MakeValidatedEmail( local_part='उदाहरण.परीक्ष', smtputf8=True, ascii_domain='domain.with.idn.tld', @@ -277,7 +286,7 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ( 'ιωάννης@εεττ.gr', - ValidatedEmail( + MakeValidatedEmail( local_part='ιωάννης', smtputf8=True, ascii_domain='xn--qxaa9ba.gr', From 5cf49cf87478a421df21ffeff9a1c87e30470e09 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Fri, 10 May 2024 09:07:03 -0400 Subject: [PATCH 147/174] Move README section on unsafe Unicode to a later section since it applies to both the local part and the domain part --- README.md | 55 +++++++++++++++++++------------------------------------ 1 file changed, 19 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 2c12c93..7b71ee4 100644 --- a/README.md +++ b/README.md @@ -184,8 +184,12 @@ Internationalized email addresses The email protocol SMTP and the domain name system DNS have historically only allowed English (ASCII) characters in email addresses and domain names, respectively. 
Each has adapted to internationalization in a separate -way, creating two separate aspects to email address -internationalization. +way, creating two separate aspects to email address internationalization. + +(If your mail submission library doesn't support Unicode at all, then +immediately prior to mail submission you must replace the email address with +its ASCII-ized form. This library gives you back the ASCII-ized form in the +`ascii_email` field in the returned object.) ### Internationalized domain names (IDN) @@ -208,6 +212,19 @@ email addresses, only English letters, numbers, and some punctuation (`._!#$%&'^``*+-=~/?{|}`) are allowed. In internationalized email address local parts, a wider range of Unicode characters are allowed. +Email addresses with these non-ASCII characters require that your mail +submission library and all the mail servers along the route to the destination, +including your own outbound mail server, all support the +[SMTPUTF8 (RFC 6531)](https://tools.ietf.org/html/rfc6531) extension. +Support for SMTPUTF8 varies. If you know ahead of time that SMTPUTF8 is not +supported by your mail submission stack, then you must filter out addresses that +require SMTPUTF8 using the `allow_smtputf8=False` keyword argument (see above). +This will cause the validation function to raise a `EmailSyntaxError` if +delivery would require SMTPUTF8. If you do not set `allow_smtputf8=False`, +you can also check the value of the `smtputf8` field in the returned object. + +### Unsafe Unicode characters are rejected + A surprisingly large number of Unicode characters are not safe to display, especially when the email address is concatenated with other text, so this library tries to protect you by not permitting reserved, non-, private use, @@ -226,40 +243,6 @@ with the normalized email address string returned by this library. 
This does not guard against the well known problem that many Unicode characters look alike (or are identical), which can be used to fool humans reading displayed text. -Email addresses with these non-ASCII characters require that your mail -submission library and the mail servers along the route to the destination, -including your own outbound mail server, all support the -[SMTPUTF8 (RFC 6531)](https://tools.ietf.org/html/rfc6531) extension. -Support for SMTPUTF8 varies. See the `allow_smtputf8` parameter. - -### If you know ahead of time that SMTPUTF8 is not supported by your mail submission stack - -By default all internationalized forms are accepted by the validator. -But if you know ahead of time that SMTPUTF8 is not supported by your -mail submission stack, then you must filter out addresses that require -SMTPUTF8 using the `allow_smtputf8=False` keyword argument (see above). -This will cause the validation function to raise a `EmailSyntaxError` if -delivery would require SMTPUTF8. That's just in those cases where -non-ASCII characters appear before the @-sign. If you do not set -`allow_smtputf8=False`, you can also check the value of the `smtputf8` -field in the returned object. - -If your mail submission library doesn't support Unicode at all --- even -in the domain part of the address --- then immediately prior to mail -submission you must replace the email address with its ASCII-ized form. -This library gives you back the ASCII-ized form in the `ascii_email` -field in the returned object, which you can get like this: - -```python -emailinfo = validate_email(email, allow_smtputf8=False) -email = emailinfo.ascii_email -``` - -The local part is left alone (if it has internationalized characters -`allow_smtputf8=False` will force validation to fail) and the domain -part is converted to [IDNA ASCII](https://tools.ietf.org/html/rfc5891). 
-(You probably should not do this at account creation time so you don't -change the user's login information without telling them.) Normalization ------------- From 8d2610ad0dc519befea020b07cc52c726bb1641e Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Thu, 13 Jun 2024 14:18:36 -0400 Subject: [PATCH 148/174] Fix the domain name length limit I previously copied the domain name length limit from the RFCs, but I misunderstood that "octets" in the RFCs didn't mean the number of characters in the ASCII domain name but the number of bytes as transmitted. When transmitted, the domain name has one byte for each label (part between periods) giving the label length. Those bytes correspond to the dots, except for the last label which doesn't have a dot, and the empty label which isn't printed. So the longest domain name length in characters is two less than what I thought. See https://stackoverflow.com/questions/32290167/what-is-the-maximum-length-of-a-dns-name for explanation. I noticed this when I saw that the idna package was rejecting domain names with 254 characters which this library accepted. --- CHANGELOG.md | 1 + email_validator/rfc_constants.py | 2 +- tests/test_syntax.py | 6 +++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1e41c4c..53c72bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ In Development -------------- +* The domain name length limit is corrected from 255 to 253 IDNA ASCII characters. I misread the RFCs. * When a domain name has no MX record but does have an A or AAAA record, if none of the IP addresses in the response are globally reachable (i.e. not Private-Use, Loopback, etc.), the response is treated as if there was no A/AAAA response and the email address will fail the deliverability check. 
* When a domain name has no MX record but does have an A or AAAA record, the mx field in the object returned by validate_email incorrectly held the IP addresses rather than the domain itself. * Fixes in tests. diff --git a/email_validator/rfc_constants.py b/email_validator/rfc_constants.py index a6b9c59..2574c71 100644 --- a/email_validator/rfc_constants.py +++ b/email_validator/rfc_constants.py @@ -42,7 +42,7 @@ EMAIL_MAX_LENGTH = 254 LOCAL_PART_MAX_LENGTH = 64 DNS_LABEL_LENGTH_LIMIT = 63 # in "octets", RFC 1035 2.3.1 -DOMAIN_MAX_LENGTH = 255 # in "octets", RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2 +DOMAIN_MAX_LENGTH = 253 # in "octets" as transmitted, RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2, and see https://stackoverflow.com/questions/32290167/what-is-the-maximum-length-of-a-dns-name # RFC 2142 CASE_INSENSITIVE_MAILBOX_NAMES = [ diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 693d7da..e5aecff 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -343,9 +343,9 @@ def test_domain_literal(): ('obsolete."quoted".atom@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), ('11111111112222222222333333333344444444445555555555666666666677777@example.com', 'The email address is too long before the @-sign (1 character too many).'), ('111111111122222222223333333333444444444455555555556666666666777777@example.com', 'The email address is too long before the @-sign (2 characters too many).'), - ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444444444455555555556.com', 'The email address is too long (4 characters too many).'), - 
('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555566.com', 'The email address is too long after the @-sign (1 character too many).'), - ('me@中1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555566.com', 'The email address is too long after the @-sign.'), + ('meme@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.com', 'The email address is too long (4 characters too many).'), + ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444444445555555555.com', 'The email address is too long after the @-sign (1 character too many).'), + ('me@中1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444.com', 'The email address is too long after the @-sign.'), ('my.long.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444.info', 'The email address is too long (2 characters too many).'), 
('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333.info', 'The email address is too long (when converted to IDNA ASCII).'), ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (at least 1 character too many).'), From fd335321b1a19ab15143d63a67f0f97311aa4158 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 16 Jun 2024 18:50:35 -0400 Subject: [PATCH 149/174] Bump test_requirements.txt --- test_requirements.txt | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/test_requirements.txt b/test_requirements.txt index d05813d..bea5d5a 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -7,20 +7,20 @@ # the earliest Python version we support, and some exception # messages may depend on package versions, so we pin versions # for reproducible testing.) 
-coverage==7.4.4 +coverage==7.5.3 dnspython==2.6.1 -exceptiongroup==1.2.0 -flake8==7.0.0 +exceptiongroup==1.2.1 +flake8==7.1.0 idna==3.7 iniconfig==2.0.0 mccabe==0.7.0 -mypy==1.9.0 +mypy==1.10.0 mypy-extensions==1.0.0 -packaging==24.0 -pluggy==1.4.0 -pycodestyle==2.11.1 +packaging==24.1 +pluggy==1.5.0 +pycodestyle==2.12.0 pyflakes==3.2.0 -pytest==8.1.1 +pytest==8.2.2 pytest-cov==5.0.0 tomli==2.0.1 -typing_extensions==4.11.0 +typing_extensions==4.12.2 From 077f5688f1cc019b36b238c9d4f674ef73fbe6fb Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 16 Jun 2024 21:18:36 -0400 Subject: [PATCH 150/174] Version 2.1.2 --- .github/workflows/test_and_build.yaml | 2 +- CHANGELOG.md | 4 ++-- email_validator/version.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test_and_build.yaml b/.github/workflows/test_and_build.yaml index 5268a2b..9abb554 100644 --- a/.github/workflows/test_and_build.yaml +++ b/.github/workflows/test_and_build.yaml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12.0"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v3 diff --git a/CHANGELOG.md b/CHANGELOG.md index 53c72bb..a1944c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,5 @@ -In Development --------------- +2.1.2 (June 16, 2024) +--------------------- * The domain name length limit is corrected from 255 to 253 IDNA ASCII characters. I misread the RFCs. * When a domain name has no MX record but does have an A or AAAA record, if none of the IP addresses in the response are globally reachable (i.e. not Private-Use, Loopback, etc.), the response is treated as if there was no A/AAAA response and the email address will fail the deliverability check. 
diff --git a/email_validator/version.py b/email_validator/version.py index 58039f5..4eabd0b 100644 --- a/email_validator/version.py +++ b/email_validator/version.py @@ -1 +1 @@ -__version__ = "2.1.1" +__version__ = "2.1.2" From 34268859ef24420a48f3658c00f11e577bbd25d7 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 11 Jun 2024 22:09:25 -0400 Subject: [PATCH 151/174] Several fixes for parsing display names * Fix error message text for input addresses without @-signs. The incorrect message was "There must be something after the @-sign.". This was broken by the changes to parse display names. Prior to that, the message was "The email address is not valid. It must have exactly one @-sign.". * Move the allow_display_name check to the end of the syntax checks. The optional checks should be the last to occur so that fatal syntax errors are raised first. * Check that display name email addresses have a closing angle bracket and nothing after. * Don't treat < + U+0338 (Combining Long Solidus Overlay) as the start of a bracketed email address. This would already be rejected because the combining character would be reported as an unsafe character at the start of the address, but it may be confusing since the caller won't see the address that way. When splitting the address into parts, skip the other special characters (@, quote, backslash) that have meaningful combining characters after them (i.e. they change under NFC normalization), although I don't think there are any such cases. 
--- email_validator/syntax.py | 27 ++++++++++++++++++++++++--- email_validator/validate_email.py | 7 +++++-- tests/test_syntax.py | 5 +++++ 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index efbcd73..5d7af41 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -48,12 +48,22 @@ def split_email(email: str) -> Tuple[Optional[str], str, str, bool]: def split_string_at_unquoted_special(text: str, specials: Tuple[str, ...]) -> Tuple[str, str]: # Split the string at the first character in specials (an @-sign - # or left angle bracket) that does not occur within quotes. + # or left angle bracket) that does not occur within quotes and + # is not followed by a Unicode combining character. + # If no special character is found, raise an error. inside_quote = False escaped = False left_part = "" - for c in text: - if inside_quote: + for i, c in enumerate(text): + # < plus U+0338 (Combining Long Solidus Overlay) normalizes to + # ≮ U+226E (Not Less-Than), and it would be confusing to treat + # the < as the start of "" syntax in that case. Liekwise, + # if anything combines with an @ or ", we should probably not + # treat it as a special character. + if unicodedata.normalize("NFC", text[i:])[0] != c: + left_part += c + + elif inside_quote: left_part += c if c == '\\' and not escaped: escaped = True @@ -72,6 +82,9 @@ def split_string_at_unquoted_special(text: str, specials: Tuple[str, ...]) -> Tu else: left_part += c + if len(left_part) == len(text): + raise EmailSyntaxError("An email address must have an @-sign.") + # The right part is whatever is left. right_part = text[len(left_part):] @@ -134,6 +147,14 @@ def unquote_quoted_string(text: str) -> Tuple[str, bool]: # Check for other unsafe characters. check_unsafe_chars(display_name, allow_space=True) + # Check that the right part ends with an angle bracket + # but allow spaces after it, I guess. 
+ if ">" not in right_part: + raise EmailSyntaxError("An open angle bracket at the start of the email address has to be followed by a close angle bracket at the end.") + right_part = right_part.rstrip(" ") + if right_part[-1] != ">": + raise EmailSyntaxError("There can't be anything after the email address.") + # Remove the initial and trailing angle brackets. addr_spec = right_part[1:].rstrip(">") diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index 2adda2a..19db902 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -68,8 +68,6 @@ def validate_email( # part if the local part is quoted. display_name, local_part, domain_part, is_quoted_local_part \ = split_email(email) - if display_name is not None and not allow_display_name: - raise EmailSyntaxError("A display name and angle brackets around the email address are not permitted here.") # Collect return values in this instance. ret = ValidatedEmail() @@ -139,6 +137,11 @@ def validate_email( # Check the length of the address. validate_email_length(ret) + # Check that a display name is permitted. It's the last syntax check + # because we always check against optional parsing features last. + if display_name is not None and not allow_display_name: + raise EmailSyntaxError("A display name and angle brackets around the email address are not permitted here.") + if check_deliverability and not test_environment: # Validate the email address's deliverability using DNS # and update the returned ValidatedEmail object with metadata. diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 6d8dc72..d4a9844 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -352,6 +352,7 @@ def test_domain_literal() -> None: @pytest.mark.parametrize( 'email_input,error_msg', [ + ('hello.world', 'An email address must have an @-sign.'), ('my@localhost', 'The part after the @-sign is not valid. 
It should have a period.'), ('my@.leadingdot.com', 'An email address cannot have a period immediately after the @-sign.'), ('my@.leadingfwdot.com', 'An email address cannot have a period immediately after the @-sign.'), @@ -413,6 +414,10 @@ def test_domain_literal() -> None: ('me@[untaggedtext]', 'The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.'), ('me@[tag:invalid space]', 'The part after the @-sign contains invalid characters in brackets: SPACE.'), ('', 'A display name and angle brackets around the email address are not permitted here.'), + (' !', 'There can\'t be anything after the email address.'), + ('<\u0338me@example.com', 'The email address contains invalid characters before the @-sign: \'<\'.'), + ('DisplayName ', 'An email address cannot have a hyphen immediately after the @-sign.'), ('DisplayName ', 'A display name and angle brackets around the email address are not permitted here.'), ('Display Name ', 'A display name and angle brackets around the email address are not permitted here.'), ('\"Display Name\" ', 'A display name and angle brackets around the email address are not permitted here.'), From 1fb55d4d654ec32903e7a1ed84530a5f3a0a38d6 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 11 Jun 2024 15:52:52 -0400 Subject: [PATCH 152/174] Add a test that shows that the local part is returned with Unicode NFC normalization s + U+0323 + U+0307 normalizes under NFC to U+1E69 (Latin Small Letter S With Dot Below And Dot Above) (https://www.unicode.org/reports/tr15/). We normalize when creating the returned email address info. 
--- tests/test_syntax.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_syntax.py b/tests/test_syntax.py index d4a9844..b150413 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -294,6 +294,16 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: normalized='ιωάννης@εεττ.gr', ), ), + ( + 's\u0323\u0307@nfc.tld', + MakeValidatedEmail( + local_part='\u1E69', + smtputf8=True, + ascii_domain='nfc.tld', + domain='nfc.tld', + normalized='\u1E69@nfc.tld', + ), + ), ], ) def test_email_valid_intl_local_part(email_input: str, output: ValidatedEmail) -> None: From 9ef1f829aa5dad1a936d822264181cfdcd03a576 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 11 Jun 2024 15:31:22 -0400 Subject: [PATCH 153/174] Check that the local part is valid after Unicode NFC normalization to prevent injection of invalid characters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We encourage callers to use the normalized email address returned by validate_email (in the `normalized` attribute). This form has had Unicode NFC normalization applied to the local part. However, all of the syntactic validation on the local part was performed before the normalization. Consequently, the normalization could change the local part to become invalid by the replacement of valid characters with invalid characters or by changing the length of the local part to exceed the maximum length. Callers who use the normalized form may then unexpectedly be using an invalid address. To ensure that callers do not get an invalid address, local part syntax checks are now repeated after Unicode normalization has been applied. A user submitted one case where NFC normalization changes a local part from valid to invalid: U+037E (Greek Question Mark)'s NFC normalization is the ASCII semicolon. The former is otherwise a permitted character, but ASCII semicolons are not permitted in local parts. 
The user noted that the semicolon could cause the address to be reinterpreted as a list and change the recipient of a message. No other Unicode character on its own is valid (in a local part) before normalization and invalid after --- I checked every character. I am not sure if there are character sequences that are valid before but not after normalization, but I can't yet find any: I checked that no Unicode character's NFD decomposition, when valid in a local part, normalizes under NFC to a sequence that is not valid. I also could not find any examples where NFC normalization changes something to or from a period, which could also change the validity of a local part. (The string '<' or '>' plus U+0338 (Combining Long Solidus Overlay) normalizes under NFC to ≮ U+226E (Not Less-Than) and ≯ U+226F (Not Greater-Than), respectively. The two-character sequences are not valid in a local part because < and > are not valid, although they are valid after NFC normalization. These addresses were rejected before and continue to be rejected. Although < could be the start of a bracketed email address if display names are permitted, the two-character sequence is now (as of an earlier commit) ignored for the purposes of parsing display names.) There are a small number of characters whose NFC normalization increases the string length, including U+FB2C (Hebrew Letter Shin With Dagesh And Shin Dot). This could also cause the local part to become invalid after normalization where it is valid before. This is now also caught by performing the syntax check again after normalization. (The whole-address length check is similarly fixed in a later commit.) Some checks that were previously only applied after normalization, for checking safe Unicode characters, are now also applied to the un-normalized form, which also may protect callers that ignore the normalized form and use the original email address string.
However, I could not find an example where normalization turns an unsafe string into a safe string. See #142. --- CHANGELOG.md | 1 + README.md | 30 +++++++++++++++--------------- email_validator/syntax.py | 8 ++------ email_validator/validate_email.py | 15 +++++++++++++++ tests/test_syntax.py | 6 ++++-- 5 files changed, 37 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9176582..d0e474b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ In Development -------------- +* Email addresses with internationalized local parts could, with rare Unicode characters, be returned as valid but actually be invalid in their normalized form (returned in the `normalized` field). Local parts now re-validated after Unicode NFC normalization to ensure that invalid characters cannot be injected into the normalized address and that characters with length-increasing NFC normalizations cannot cause a local part to exceed the maximum length after normalization. * A new option to parse `My Name ` strings, i.e. a display name plus an email address in angle brackets, is now available. It is off by default. 2.1.2 (June 16, 2024) diff --git a/README.md b/README.md index 7b71ee4..c054414 100644 --- a/README.md +++ b/README.md @@ -20,10 +20,11 @@ Key features: * Supports internationalized domain names (like `@ツ.life`), internationalized local parts (like `ツ@example.com`), and optionally parses display names (e.g. `"My Name" `). -* Rejects addresses with unsafe Unicode characters, obsolete email address - syntax that you'd find unexpected, special use domain names like - `@localhost`, and domains without a dot by default. This is an - opinionated library! +* Rejects addresses with invalid or unsafe Unicode characters, + obsolete email address syntax that you'd find unexpected, + special use domain names like `@localhost`, + and domains without a dot by default. + This is an opinionated library! 
* Normalizes email addresses (important for internationalized and quoted-string addresses! see below). * Python type annotations are used. @@ -235,13 +236,9 @@ cannot combine with something outside of the email address string or with the @-sign). See https://qntm.org/safe and https://trojansource.codes/ for relevant prior work. (Other than whitespace, these are checks that you should be applying to nearly all user inputs in a security-sensitive -context.) - -These character checks are performed after Unicode normalization (see below), -so you are only fully protected if you replace all user-provided email addresses -with the normalized email address string returned by this library. This does not -guard against the well known problem that many Unicode characters look alike -(or are identical), which can be used to fool humans reading displayed text. +context.) This does not guard against the well known problem that many +Unicode characters look alike, which can be used to fool humans reading +displayed text. Normalization @@ -257,7 +254,7 @@ address. For example, the CJK fullwidth Latin letters are considered semantically equivalent in domain names to their ASCII counterparts. This library -normalizes them to their ASCII counterparts: +normalizes them to their ASCII counterparts (as required by IDNA): ```python emailinfo = validate_email("me@Domain.com") @@ -270,9 +267,7 @@ Because an end-user might type their email address in different (but equivalent) un-normalized forms at different times, you ought to replace what they enter with the normalized form immediately prior to going into your database (during account creation), querying your database -(during login), or sending outbound mail. Normalization may also change -the length of an email address, and this may affect whether it is valid -and acceptable by your SMTP provider. +(during login), or sending outbound mail. 
The normalizations include lowercasing the domain part of the email address (domain names are case-insensitive), [Unicode "NFC" @@ -286,6 +281,11 @@ in the domain part, possibly other [UTS46](http://unicode.org/reports/tr46) mappings on the domain part, and conversion from Punycode to Unicode characters. +Normalization may change the characters in the email address and the +length of the email address, such that a string might be a valid address +before normalization but invalid after, or vice versa. This library only +permits addresses that are valid both before and after normalization. + (See [RFC 6532 (internationalized email) section 3.1](https://tools.ietf.org/html/rfc6532#section-3.1) and [RFC 5895 (IDNA 2008) section 2](http://www.ietf.org/rfc/rfc5895.txt).) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 5d7af41..670a6ea 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -315,12 +315,8 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp valid = "quoted" # If the local part matches the internationalized dot-atom form or was quoted, - # perform normalization and additional checks for Unicode strings. + # perform additional checks for Unicode strings. if valid: - # RFC 6532 section 3.1 says that Unicode NFC normalization should be applied, - # so we'll return the normalized local part in the return value. - local = unicodedata.normalize("NFC", local) - # Check that the local part is a valid, safe, and sensible Unicode string. # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked # by DOT_ATOM_TEXT_INTL and QTEXT_INTL. Other characters may be permitted by the @@ -385,7 +381,7 @@ def check_unsafe_chars(s: str, allow_space: bool = False) -> None: # Combining character in first position would combine with something # outside of the email address if concatenated, so they are not safe. 
# We also check if this occurs after the @-sign, which would not be - # sensible. + # sensible because it would modify the @-sign. if i == 0: bad_chars.add(c) elif category == "Zs": diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index 19db902..c5e852b 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -1,4 +1,5 @@ from typing import Optional, Union, TYPE_CHECKING +import unicodedata from .exceptions_types import EmailSyntaxError, ValidatedEmail from .syntax import split_email, validate_email_local_part, validate_email_domain_name, validate_email_domain_literal, validate_email_length @@ -86,6 +87,20 @@ def validate_email( ret.ascii_local_part = local_part_info["ascii_local_part"] ret.smtputf8 = local_part_info["smtputf8"] + # RFC 6532 section 3.1 says that Unicode NFC normalization should be applied, + # so we'll return the NFC-normalized local part. Since the caller may use that + # string in place of the original string, ensure it is also valid. + normalized_local_part = unicodedata.normalize("NFC", ret.local_part) + if normalized_local_part != ret.local_part: + try: + validate_email_local_part(normalized_local_part, + allow_smtputf8=allow_smtputf8, + allow_empty_local=allow_empty_local, + quoted_local_part=is_quoted_local_part) + except EmailSyntaxError as e: + raise EmailSyntaxError("After Unicode normalization: " + str(e)) from e + ret.local_part = normalized_local_part + # If a quoted local part isn't allowed but is present, now raise an exception. # This is done after any exceptions raised by validate_email_local_part so # that mandatory checks have highest precedence. 
diff --git a/tests/test_syntax.py b/tests/test_syntax.py index b150413..665ece1 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -398,14 +398,16 @@ def test_domain_literal() -> None: ('\nmy@example.com', 'The email address contains invalid characters before the @-sign: U+000A.'), ('m\ny@example.com', 'The email address contains invalid characters before the @-sign: U+000A.'), ('my\n@example.com', 'The email address contains invalid characters before the @-sign: U+000A.'), + ('me.\u037e@example.com', 'After Unicode normalization: The email address contains invalid characters before the @-sign: \';\'.'), ('test@\n', 'The part after the @-sign contains invalid characters: U+000A.'), ('bad"quotes"@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), ('obsolete."quoted".atom@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), ('11111111112222222222333333333344444444445555555555666666666677777@example.com', 'The email address is too long before the @-sign (1 character too many).'), ('111111111122222222223333333333444444444455555555556666666666777777@example.com', 'The email address is too long before the @-sign (2 characters too many).'), - ('meme@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.com', 'The email address is too long (4 characters too many).'), + ('\uFB2C111111122222222223333333333444444444455555555556666666666777777@example.com', 'After Unicode normalization: The email address is too long before the @-sign (2 characters too many).'), 
('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444444445555555555.com', 'The email address is too long after the @-sign (1 character too many).'), ('me@中1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444.com', 'The email address is too long after the @-sign.'), + ('meme@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.com', 'The email address is too long (4 characters too many).'), ('my.long.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444.info', 'The email address is too long (2 characters too many).'), ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333.info', 'The email address is too long (when converted to IDNA ASCII).'), ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (at least 1 character too many).'), @@ -439,7 +441,7 @@ def test_email_invalid_syntax(email_input: str, 
error_msg: str) -> None: # Since these all have syntax errors, deliverability # checks do not arise. with pytest.raises(EmailSyntaxError) as exc_info: - validate_email(email_input) + validate_email(email_input, check_deliverability=False) assert str(exc_info.value) == error_msg From f8709e81b8a944658d5b22834c5867f308c4d4de Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 11 Jun 2024 14:56:27 -0400 Subject: [PATCH 154/174] Check that email address length is valid on the original email address string since callers may continue to use that string Previously, we checked that the ASCII email address (with IDNA ASCII) and the normalized email address satisfied the whole-address length limit. However, callers may use the original input string. Since Unicode NFC normalization typically reduces string length (if it changes the string), this can cause the post-normalization check to pass when the pre-normalization length is not valid. So we should additionally check that the original input also meets the maximum length requirement. Callers might also construct an address that has an internationalized local part and ASCII domain, maybe? So that's now checked too. The whole-address length test is revised to test each possible address format, first the original email address string (with any display name removed) so that exception messages correspond to the input string where possible. Then the normalized address is checked, since we encourage callers to use it. Then the ASCII address is checked since callers who send email without a SMTPUTF8-enabled stack will use this, or the normalized internationalized local part (there won't be an ASCII local part in this case) combined with the ASCII domain. 
Some length tests are added with a Unicode character whose NFC normalization is actually a decomposition: U+FB2C (Hebrew Letter Shin With Dagesh And Shin Dot) is unusual in that its NFC normalization actually expands it to multiple code points (https://www.unicode.org/faq/normalization.html). In these cases, the address will be valid before normalization but not valid after. See #142. --- CHANGELOG.md | 3 +- README.md | 7 --- email_validator/syntax.py | 101 ++++++++++++++++++------------ email_validator/validate_email.py | 4 +- tests/test_syntax.py | 13 ++-- 5 files changed, 75 insertions(+), 53 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d0e474b..14e67d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ In Development -------------- * Email addresses with internationalized local parts could, with rare Unicode characters, be returned as valid but actually be invalid in their normalized form (returned in the `normalized` field). Local parts now re-validated after Unicode NFC normalization to ensure that invalid characters cannot be injected into the normalized address and that characters with length-increasing NFC normalizations cannot cause a local part to exceed the maximum length after normalization. +* The length check for email addresses with internationalized local parts is now also applied to the original address string prior to Unicode NFC normalization, which may be longer and could exceed the maximum email address length, to protect callers who do not use the returned normalized address. * A new option to parse `My Name ` strings, i.e. a display name plus an email address in angle brackets, is now available. It is off by default. 2.1.2 (June 16, 2024) @@ -10,7 +11,7 @@ In Development * The domain name length limit is corrected from 255 to 253 IDNA ASCII characters. I misread the RFCs. * When a domain name has no MX record but does have an A or AAAA record, if none of the IP addresses in the response are globally reachable (i.e. 
not Private-Use, Loopback, etc.), the response is treated as if there was no A/AAAA response and the email address will fail the deliverability check. * When a domain name has no MX record but does have an A or AAAA record, the mx field in the object returned by validate_email incorrectly held the IP addresses rather than the domain itself. -* Fixes in tests. +* Fixes in tests. Some additional tests added. 2.1.1 (February 26, 2024) ------------------------- diff --git a/README.md b/README.md index c054414..895dfa9 100644 --- a/README.md +++ b/README.md @@ -300,13 +300,6 @@ they are unnecessary. For IPv6 domain literals, the IPv6 address is normalized to condensed form. [RFC 2142](https://datatracker.ietf.org/doc/html/rfc2142) also requires lowercase normalization for some specific mailbox names like `postmaster@`. -### Length checks - -This library checks that the length of the email address is not longer than -the maximum length. The check is performed on the normalized form of the -address, which might be different from a string provided by a user. If you -send email to the original string and not the normalized address, the email -might be rejected because the original address could be too long. 
Examples -------- diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 670a6ea..3375fa4 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -176,12 +176,11 @@ def unquote_quoted_string(text: str) -> Tuple[str, bool]: return display_name, local_part, domain_part, is_quoted_local_part -def get_length_reason(addr: str, utf8: bool = False, limit: int = EMAIL_MAX_LENGTH) -> str: +def get_length_reason(addr: str, limit: int) -> str: """Helper function to return an error message related to invalid length.""" diff = len(addr) - limit - prefix = "at least " if utf8 else "" suffix = "s" if diff > 1 else "" - return f"({prefix}{diff} character{suffix} too many)" + return f"({diff} character{suffix} too many)" def safe_character_display(c: str) -> str: @@ -609,44 +608,66 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob def validate_email_length(addrinfo: ValidatedEmail) -> None: - # If the email address has an ASCII representation, then we assume it may be - # transmitted in ASCII (we can't assume SMTPUTF8 will be used on all hops to - # the destination) and the length limit applies to ASCII characters (which is - # the same as octets). The number of characters in the internationalized form - # may be many fewer (because IDNA ASCII is verbose) and could be less than 254 - # Unicode characters, and of course the number of octets over the limit may - # not be the number of characters over the limit, so if the email address is - # internationalized, we can't give any simple information about why the address - # is too long. - if addrinfo.ascii_email and len(addrinfo.ascii_email) > EMAIL_MAX_LENGTH: - if addrinfo.ascii_email == addrinfo.normalized: - reason = get_length_reason(addrinfo.ascii_email) - elif len(addrinfo.normalized) > EMAIL_MAX_LENGTH: - # If there are more than 254 characters, then the ASCII - # form is definitely going to be too long. 
- reason = get_length_reason(addrinfo.normalized, utf8=True) - else: - reason = "(when converted to IDNA ASCII)" - raise EmailSyntaxError(f"The email address is too long {reason}.") - - # In addition, check that the UTF-8 encoding (i.e. not IDNA ASCII and not - # Unicode characters) is at most 254 octets. If the addres is transmitted using - # SMTPUTF8, then the length limit probably applies to the UTF-8 encoded octets. - # If the email address has an ASCII form that differs from its internationalized - # form, I don't think the internationalized form can be longer, and so the ASCII - # form length check would be sufficient. If there is no ASCII form, then we have - # to check the UTF-8 encoding. The UTF-8 encoding could be up to about four times - # longer than the number of characters. + # There are three forms of the email address whose length must be checked: # - # See the length checks on the local part and the domain. - if len(addrinfo.normalized.encode("utf8")) > EMAIL_MAX_LENGTH: - if len(addrinfo.normalized) > EMAIL_MAX_LENGTH: - # If there are more than 254 characters, then the UTF-8 - # encoding is definitely going to be too long. - reason = get_length_reason(addrinfo.normalized, utf8=True) - else: - reason = "(when encoded in bytes)" - raise EmailSyntaxError(f"The email address is too long {reason}.") + # 1) The original email address string. Since callers may continue to use + # this string, even though we recommend using the normalized form, we + # should not pass validation when the original input is not valid. This + # form is checked first because it is the original input. + # 2) The normalized email address. We perform Unicode NFC normalization of + # the local part, we normalize the domain to internationalized characters + # (if originaly IDNA ASCII) which also includes Unicode normalization, + # and we may remove quotes in quoted local parts. We recommend that + # callers use this string, so it must be valid. 
+ # 3) The email address with the IDNA ASCII representation of the domain + # name, since this string may be used with email stacks that don't + # support UTF-8. Since this is the least likely to be used by callers, + # it is checked last. Note that ascii_email will only be set if the + # local part is ASCII, but conceivably the caller may combine a + # internationalized local part with an ASCII domain, so we check this + # on that combination also. Since we only return the normalized local + # part, we use that (and not the unnormalized local part). + # + # In all cases, the length is checked in UTF-8 because the SMTPUTF8 + # extension to SMTP validates the length in bytes. + + addresses_to_check = [ + (addrinfo.original, None), + (addrinfo.normalized, "after normalization"), + ((addrinfo.ascii_local_part or addrinfo.local_part or "") + "@" + addrinfo.ascii_domain, "when the part after the @-sign is converted to IDNA ASCII"), + ] + + for addr, reason in addresses_to_check: + addr_len = len(addr) + addr_utf8_len = len(addr.encode("utf8")) + diff = addr_utf8_len - EMAIL_MAX_LENGTH + if diff > 0: + if reason is None and addr_len == addr_utf8_len: + # If there is no normalization or transcoding, + # we can give a simple count of the number of + # characters over the limit. + reason = get_length_reason(addr, limit=EMAIL_MAX_LENGTH) + elif reason is None: + # If there is no normalization but there is + # some transcoding to UTF-8, we can compute + # the minimum number of characters over the + # limit by dividing the number of bytes over + # the limit by the maximum number of bytes + # per character. 
+ mbpc = max(len(c.encode("utf8")) for c in addr) + mchars = max(1, diff // mbpc) + suffix = "s" if diff > 1 else "" + if mchars == diff: + reason = f"({diff} character{suffix} too many)" + else: + reason = f"({mchars}-{diff} character{suffix} too many)" + else: + # Since there is normalization, the number of + # characters in the input that need to change is + # impossible to know. + suffix = "s" if diff > 1 else "" + reason += f" ({diff} byte{suffix} too many)" + raise EmailSyntaxError(f"The email address is too long {reason}.") class DomainLiteralValidationResult(TypedDict): diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index c5e852b..a134c77 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -72,7 +72,9 @@ def validate_email( # Collect return values in this instance. ret = ValidatedEmail() - ret.original = email + ret.original = ((local_part if not is_quoted_local_part + else ('"' + local_part + '"')) + + "@" + domain_part) # drop the display name, if any, for email length tests at the end ret.display_name = display_name # Validate the email address's local part syntax and get a normalized form. 
diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 665ece1..f1f005a 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -409,10 +409,15 @@ def test_domain_literal() -> None: ('me@中1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444.com', 'The email address is too long after the @-sign.'), ('meme@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.com', 'The email address is too long (4 characters too many).'), ('my.long.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444.info', 'The email address is too long (2 characters too many).'), - ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333.info', 'The email address is too long (when converted to IDNA ASCII).'), - ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (at least 1 character too many).'), - 
('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444.info', 'The email address is too long (when encoded in bytes).'), - ('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (at least 1 character too many).'), + ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (1-2 characters too many).'), + ('my.long.address@\uFB2C111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (1-3 characters too many).'), + ('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444.info', 'The email address is too long (1 character too many).'), + ('my.λong.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (1-2 characters too many).'), + 
('my.\u0073\u0323\u0307.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (1-2 characters too many).'), + ('my.\uFB2C.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444.info', 'The email address is too long (1 character too many).'), + ('my.\uFB2C.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344.info', 'The email address is too long after normalization (1 byte too many).'), + ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333.info', 'The email address is too long when the part after the @-sign is converted to IDNA ASCII (1 byte too many).'), + ('my.λong.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333.info', 'The email address is too long when the part after the @-sign is converted to IDNA ASCII (2 bytes too many).'), ('me@bad-tld-1', 'The part after the @-sign is not valid. It should have a period.'), ('me@bad.tld-2', 'The part after the @-sign is not valid. 
It is not within a valid top-level domain.'), ('me@xn--0.tld', 'The part after the @-sign is not valid IDNA (Invalid A-label).'), From 452e0ca12e8058701e957a16e8757c9722576037 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 17 Jun 2024 09:30:36 -0400 Subject: [PATCH 155/174] Add tests for domain label length --- tests/test_syntax.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_syntax.py b/tests/test_syntax.py index f1f005a..a2c2bb9 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -407,6 +407,9 @@ def test_domain_literal() -> None: ('\uFB2C111111122222222223333333333444444444455555555556666666666777777@example.com', 'After Unicode normalization: The email address is too long before the @-sign (2 characters too many).'), ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444444445555555555.com', 'The email address is too long after the @-sign (1 character too many).'), ('me@中1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444.com', 'The email address is too long after the @-sign.'), + ('me@1111111111222222222233333333334444444444555555555666666666677777.com', 'After the @-sign, periods cannot be separated by so many characters (1 character too many).'), + ('me@11111111112222222222333333333344444444445555555556666666666777777.com', 'After the @-sign, periods cannot be separated by so many characters (2 characters too many).'), + ('me@中111111111222222222233333333334444444444555555555666666.com', 'The part after the @-sign contains invalid characters (Label too long).'), 
('meme@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.com', 'The email address is too long (4 characters too many).'), ('my.long.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444.info', 'The email address is too long (2 characters too many).'), ('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (1-2 characters too many).'), From c23c0d66cbf407875bc645d1727bfdb9bc3a32b0 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 17 Jun 2024 09:31:45 -0400 Subject: [PATCH 156/174] Improve the error message for IDNA domains being too long by handling the length check ourselves rather than in idna.encode --- CHANGELOG.md | 1 + email_validator/syntax.py | 37 ++++++++++++++++++------------------- tests/test_syntax.py | 5 +++-- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14e67d6..fcaa452 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ In Development * Email addresses with internationalized local parts could, with rare Unicode characters, be returned as valid but actually be invalid in their normalized form (returned in the `normalized` field). 
Local parts now re-validated after Unicode NFC normalization to ensure that invalid characters cannot be injected into the normalized address and that characters with length-increasing NFC normalizations cannot cause a local part to exceed the maximum length after normalization. * The length check for email addresses with internationalized local parts is now also applied to the original address string prior to Unicode NFC normalization, which may be longer and could exceed the maximum email address length, to protect callers who do not use the returned normalized address. +* Improved error message for IDNA domains that are too long. * A new option to parse `My Name ` strings, i.e. a display name plus an email address in angle brackets, is now available. It is off by default. 2.1.2 (June 16, 2024) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 3375fa4..31228c3 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -469,6 +469,7 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob # such as "⒈" which is invalid because it would expand to include a dot. # Since several characters are normalized to a dot, this has to come before # checks related to dots, like check_dot_atom which comes next. + original_domain = domain try: domain = idna.uts46_remap(domain, std3_rules=False, transitional=False) except idna.IDNAError as e: @@ -498,29 +499,22 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob # the MTA must either support SMTPUTF8 or the mail client must convert the # domain name to IDNA before submission. # - # Unfortunately this step incorrectly 'fixes' domain names with leading - # periods by removing them, so we have to check for this above. It also gives - # a funky error message ("No input") when there are two periods in a - # row, also checked separately above. - # # For ASCII-only domains, the transformation does nothing and is safe to # apply. 
However, to ensure we don't rely on the idna library for basic # syntax checks, we don't use it if it's not needed. # - # uts46 is off here because it is handled above. + # idna.encode also checks the domain name length after encoding but it + # doesn't give a nice error, so we call the underlying idna.alabel method + # directly. idna.alabel checks label length and doesn't give great messages, + # but we can't easily go to lower level methods. try: - ascii_domain = idna.encode(domain, uts46=False).decode("ascii") + ascii_domain = ".".join( + idna.alabel(label).decode("ascii") + for label in domain.split(".") + ) except idna.IDNAError as e: - if "Domain too long" in str(e): - # We can't really be more specific because UTS-46 normalization means - # the length check is applied to a string that is different from the - # one the user supplied. Also I'm not sure if the length check applies - # to the internationalized form, the IDNA ASCII form, or even both! - raise EmailSyntaxError("The email address is too long after the @-sign.") from e - - # Other errors seem to not be possible because the call to idna.uts46_remap - # would have already raised them. - raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") from e + # Some errors would have already been raised by idna.uts46_remap. + raise EmailSyntaxError(f"The part after the @-sign is invalid ({e}).") from e # Check the syntax of the string returned by idna.encode. # It should never fail. @@ -535,8 +529,13 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob # as IDNA ASCII. (This is also checked by idna.encode, so this exception # is never reached for internationalized domains.) 
if len(ascii_domain) > DOMAIN_MAX_LENGTH: - reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH) - raise EmailSyntaxError(f"The email address is too long after the @-sign {reason}.") + if ascii_domain == original_domain: + reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH) + raise EmailSyntaxError(f"The email address is too long after the @-sign {reason}.") + else: + diff = len(ascii_domain) - DOMAIN_MAX_LENGTH + s = "" if diff == 1 else "s" + raise EmailSyntaxError(f"The email address is too long after the @-sign ({diff} byte{s} too many after IDNA encoding).") # Also check the label length limit. # (RFC 1035 2.3.1) diff --git a/tests/test_syntax.py b/tests/test_syntax.py index a2c2bb9..7de4150 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -406,10 +406,11 @@ def test_domain_literal() -> None: ('111111111122222222223333333333444444444455555555556666666666777777@example.com', 'The email address is too long before the @-sign (2 characters too many).'), ('\uFB2C111111122222222223333333333444444444455555555556666666666777777@example.com', 'After Unicode normalization: The email address is too long before the @-sign (2 characters too many).'), ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444444445555555555.com', 'The email address is too long after the @-sign (1 character too many).'), - ('me@中1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444.com', 'The email address is too long after the @-sign.'), + 
('me@中1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444.com', 'The email address is too long after the @-sign (1 byte too many after IDNA encoding).'), + ('me@\uFB2C1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444.com', 'The email address is too long after the @-sign (5 bytes too many after IDNA encoding).'), ('me@1111111111222222222233333333334444444444555555555666666666677777.com', 'After the @-sign, periods cannot be separated by so many characters (1 character too many).'), ('me@11111111112222222222333333333344444444445555555556666666666777777.com', 'After the @-sign, periods cannot be separated by so many characters (2 characters too many).'), - ('me@中111111111222222222233333333334444444444555555555666666.com', 'The part after the @-sign contains invalid characters (Label too long).'), + ('me@中111111111222222222233333333334444444444555555555666666.com', 'The part after the @-sign is invalid (Label too long).'), ('meme@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.com', 'The email address is too long (4 characters too many).'), ('my.long.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444.info', 'The email address is too long (2 characters too many).'), 
('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (1-2 characters too many).'), From 7f1f281d4653f6cfa87416652a4d951ff6d35331 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 11 Jun 2024 22:41:23 -0400 Subject: [PATCH 157/174] Check domain syntax after normalization to internationalized characters as a precaution Out of caution that normalization of the domain part to internationalized characters could turn a valid domain string into an invalid one, it is re-parsed at the end to ensure that it still is validated by the idna package. I could not find any examples where that was not already caught, however, since it seems like the existing IDNA calls already prevent it. Some tests are added for invalid characters in the domain part which become invalid after Unicode NFC normalization. These were already handled. (The new code never raises an exception.) See #142. --- email_validator/syntax.py | 29 +++++++++++++++++++++++------ tests/test_syntax.py | 6 ++++++ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 31228c3..78586c6 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -446,7 +446,7 @@ class DomainNameValidationResult(TypedDict): def validate_email_domain_name(domain: str, test_environment: bool = False, globally_deliverable: bool = True) -> DomainNameValidationResult: """Validates the syntax of the domain part of an email address.""" - # Check for invalid characters before normalization. + # Check for invalid characters. 
# (RFC 952 plus RFC 6531 section 3.3 for internationalized addresses) bad_chars = { safe_character_display(c) @@ -466,8 +466,9 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob # and converting all label separators (the period/full stop, fullwidth full stop, # ideographic full stop, and halfwidth ideographic full stop) to regular dots. # It will also raise an exception if there is an invalid character in the input, - # such as "⒈" which is invalid because it would expand to include a dot. - # Since several characters are normalized to a dot, this has to come before + # such as "⒈" which is invalid because it would expand to include a dot and + # U+1FEF which normalizes to a backtick, which is not an allowed hostname character. + # Since several characters *are* normalized to a dot, this has to come before # checks related to dots, like check_dot_atom which comes next. original_domain = domain try: @@ -577,14 +578,23 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob # but not be actual IDNA. For ASCII-only domains, the conversion out # of IDNA just gives the same thing back. # - # This gives us the canonical internationalized form of the domain. + # This gives us the canonical internationalized form of the domain, + # which we return to the caller as a part of the normalized email + # address. try: domain_i18n = idna.decode(ascii_domain.encode('ascii')) except idna.IDNAError as e: raise EmailSyntaxError(f"The part after the @-sign is not valid IDNA ({e}).") from e - # Check for invalid characters after normalization. These - # should never arise. See the similar checks above. + # Check that this normalized domain name has not somehow become + # an invalid domain name. All of the checks before this point + # using the idna package probably guarantee that we now have + # a valid international domain name in most respects. But it + # doesn't hurt to re-apply some tests to be sure. 
See the similar + # tests above. + + # Check for invalid and unsafe characters. We have no test + # case for this. bad_chars = { safe_character_display(c) for c in domain @@ -594,6 +604,13 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".") check_unsafe_chars(domain) + # Check that it can be encoded back to IDNA ASCII. We have no test + # case for this. + try: + idna.encode(domain_i18n) + except idna.IDNAError as e: + raise EmailSyntaxError(f"The part after the @-sign became invalid after normalizing to international characters ({e}).") from e + # Return the IDNA ASCII-encoded form of the domain, which is how it # would be transmitted on the wire (except when used with SMTPUTF8 # possibly), as well as the canonical Unicode form of the domain, diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 7de4150..619932a 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -392,6 +392,12 @@ def test_domain_literal() -> None: ('me@⒈wouldbeinvalid.com', "The part after the @-sign contains invalid characters (Codepoint U+2488 not allowed " "at position 1 in '⒈wouldbeinvalid.com')."), + ('me@\u037e.com', + "The part after the @-sign is invalid (Codepoint U+003B at position 1 " + "of ';' not allowed)."), + ('me@\u1fef.com', + "The part after the @-sign is invalid (Codepoint U+0060 at position 1 " + "of '`' not allowed)."), ('@example.com', 'There must be something before the @-sign.'), ('white space@test', 'The email address contains invalid characters before the @-sign: SPACE.'), ('test@white space', 'The part after the @-sign contains invalid characters: SPACE.'), From 80513471731d9fadd65c6fe5694a229a56294beb Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 17 Jun 2024 09:55:19 -0400 Subject: [PATCH 158/174] Improve the error message for invalid characters in domain names after Unicode NFC 
normalization These cases were previously handled by the call to idna.encode or idna.alabel, but the error message wasn't consistent with similar checks we do for the local part. See #142. --- CHANGELOG.md | 2 +- email_validator/syntax.py | 10 ++++++++++ tests/test_syntax.py | 8 ++------ 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fcaa452..632b1ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ In Development * Email addresses with internationalized local parts could, with rare Unicode characters, be returned as valid but actually be invalid in their normalized form (returned in the `normalized` field). Local parts now re-validated after Unicode NFC normalization to ensure that invalid characters cannot be injected into the normalized address and that characters with length-increasing NFC normalizations cannot cause a local part to exceed the maximum length after normalization. * The length check for email addresses with internationalized local parts is now also applied to the original address string prior to Unicode NFC normalization, which may be longer and could exceed the maximum email address length, to protect callers who do not use the returned normalized address. -* Improved error message for IDNA domains that are too long. +* Improved error message for IDNA domains that are too long or have invalid characters after Unicode normalization. * A new option to parse `My Name ` strings, i.e. a display name plus an email address in angle brackets, is now available. It is off by default. 
2.1.2 (June 16, 2024) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 78586c6..c655451 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -476,6 +476,16 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob except idna.IDNAError as e: raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") from e + # Check for invalid characters after Unicode normalization which are not caught + # by uts46_remap (see tests for examples). + bad_chars = { + safe_character_display(c) + for c in domain + if not ATEXT_HOSTNAME_INTL.match(c) + } + if bad_chars: + raise EmailSyntaxError("The part after the @-sign contains invalid characters after Unicode normalization: " + ", ".join(sorted(bad_chars)) + ".") + # The domain part is made up dot-separated "labels." Each label must # have at least one character and cannot start or end with dashes, which # means there are some surprising restrictions on periods and dashes. 
diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 619932a..ffe4963 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -392,12 +392,8 @@ def test_domain_literal() -> None: ('me@⒈wouldbeinvalid.com', "The part after the @-sign contains invalid characters (Codepoint U+2488 not allowed " "at position 1 in '⒈wouldbeinvalid.com')."), - ('me@\u037e.com', - "The part after the @-sign is invalid (Codepoint U+003B at position 1 " - "of ';' not allowed)."), - ('me@\u1fef.com', - "The part after the @-sign is invalid (Codepoint U+0060 at position 1 " - "of '`' not allowed)."), + ('me@\u037e.com', "The part after the @-sign contains invalid characters after Unicode normalization: ';'."), + ('me@\u1fef.com', "The part after the @-sign contains invalid characters after Unicode normalization: '`'."), ('@example.com', 'There must be something before the @-sign.'), ('white space@test', 'The email address contains invalid characters before the @-sign: SPACE.'), ('test@white space', 'The part after the @-sign contains invalid characters: SPACE.'), From 6589b1e9ec2d9b9007603c0523bf96d70efb70c9 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 17 Jun 2024 10:02:35 -0400 Subject: [PATCH 159/174] Version 2.2.0 --- CHANGELOG.md | 8 +++++--- email_validator/version.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 632b1ca..2aea055 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,12 @@ -In Development --------------- +2.2.0 (June 20, 2024) +--------------------- * Email addresses with internationalized local parts could, with rare Unicode characters, be returned as valid but actually be invalid in their normalized form (returned in the `normalized` field). 
Local parts now re-validated after Unicode NFC normalization to ensure that invalid characters cannot be injected into the normalized address and that characters with length-increasing NFC normalizations cannot cause a local part to exceed the maximum length after normalization. * The length check for email addresses with internationalized local parts is now also applied to the original address string prior to Unicode NFC normalization, which may be longer and could exceed the maximum email address length, to protect callers who do not use the returned normalized address. * Improved error message for IDNA domains that are too long or have invalid characters after Unicode normalization. * A new option to parse `My Name ` strings, i.e. a display name plus an email address in angle brackets, is now available. It is off by default. +* Improvements to Python typing. +* Some additional tests added. 2.1.2 (June 16, 2024) --------------------- @@ -12,7 +14,7 @@ In Development * The domain name length limit is corrected from 255 to 253 IDNA ASCII characters. I misread the RFCs. * When a domain name has no MX record but does have an A or AAAA record, if none of the IP addresses in the response are globally reachable (i.e. not Private-Use, Loopback, etc.), the response is treated as if there was no A/AAAA response and the email address will fail the deliverability check. * When a domain name has no MX record but does have an A or AAAA record, the mx field in the object returned by validate_email incorrectly held the IP addresses rather than the domain itself. -* Fixes in tests. Some additional tests added. +* Fixes in tests. 
2.1.1 (February 26, 2024) ------------------------- diff --git a/email_validator/version.py b/email_validator/version.py index 4eabd0b..8a124bf 100644 --- a/email_validator/version.py +++ b/email_validator/version.py @@ -1 +1 @@ -__version__ = "2.1.2" +__version__ = "2.2.0" From 6d32d7a7bb3682891656ab91562c5df379cdf66f Mon Sep 17 00:00:00 2001 From: "Benjamin A. Beasley" Date: Fri, 21 Jun 2024 10:11:47 -0400 Subject: [PATCH 160/174] Fix a minor typo in README.md (validing/validating) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 895dfa9..5c1af43 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ users by their email address like on a registration form. Key features: * Checks that an email address has the correct syntax --- great for - email-based registration/login forms or validing data. + email-based registration/login forms or validating data. * Gives friendly English error messages when validation fails that you can display to end-users. * Checks deliverability (optional): Does the domain name resolve? From 1b2be12df83498144b5d30b05d49ad9798a681e8 Mon Sep 17 00:00:00 2001 From: "Benjamin A. Beasley" Date: Fri, 21 Jun 2024 10:12:38 -0400 Subject: [PATCH 161/174] Fix a minor typo in CHANGELOG.md (Verison/Version) --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2aea055..4bf9451 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -77,7 +77,7 @@ Version 1.2.1 (May 1, 2022) * example.com/net/org are removed from the special-use reserved domain names list so that they do not raise exceptions if check_deliverability is off. * Improved README. -Verison 1.2.0 (April 24, 2022) +Version 1.2.0 (April 24, 2022) ------------------------------ * Reject domains with NULL MX records (when deliverability checks From 8e1f67e47d789cd88ac6e016b187bbf9be805729 Mon Sep 17 00:00:00 2001 From: "Benjamin A. 
Beasley" Date: Fri, 21 Jun 2024 10:13:34 -0400 Subject: [PATCH 162/174] Fix minor typos in code comments --- email_validator/__main__.py | 2 +- email_validator/syntax.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/email_validator/__main__.py b/email_validator/__main__.py index 52791c7..caa111b 100644 --- a/email_validator/__main__.py +++ b/email_validator/__main__.py @@ -38,7 +38,7 @@ def main(dns_resolver: Optional[_Resolver] = None) -> None: options[varname.lower()] = float(os.environ[varname]) if len(sys.argv) == 1: - # Validate the email addresses pased line-by-line on STDIN. + # Validate the email addresses passed line-by-line on STDIN. dns_resolver = dns_resolver or caching_resolver() for line in sys.stdin: email = line.strip() diff --git a/email_validator/syntax.py b/email_validator/syntax.py index c655451..f53fd5b 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -57,7 +57,7 @@ def split_string_at_unquoted_special(text: str, specials: Tuple[str, ...]) -> Tu for i, c in enumerate(text): # < plus U+0338 (Combining Long Solidus Overlay) normalizes to # ≮ U+226E (Not Less-Than), and it would be confusing to treat - # the < as the start of "<email>" syntax in that case. Liekwise, + # the < as the start of "<email>" syntax in that case. Likewise, # if anything combines with an @ or ", we should probably not # treat it as a special character. if unicodedata.normalize("NFC", text[i:])[0] != c: @@ -642,7 +642,7 @@ def validate_email_length(addrinfo: ValidatedEmail) -> None: # form is checked first because it is the original input. # 2) The normalized email address. We perform Unicode NFC normalization of # the local part, we normalize the domain to internationalized characters - # (if originaly IDNA ASCII) which also includes Unicode normalization, + # (if originally IDNA ASCII) which also includes Unicode normalization, # and we may remove quotes in quoted local parts. 
We recommend that # callers use this string, so it must be valid. # 3) The email address with the IDNA ASCII representation of the domain From 1628852cf20a729ffec7b7a0bb620cbb3c3dce4a Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Fri, 21 Jun 2024 10:34:32 -0400 Subject: [PATCH 163/174] Expand description and add credit for the local part normalization validation issue --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4bf9451..b2d3c55 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ 2.2.0 (June 20, 2024) --------------------- -* Email addresses with internationalized local parts could, with rare Unicode characters, be returned as valid but actually be invalid in their normalized form (returned in the `normalized` field). Local parts now re-validated after Unicode NFC normalization to ensure that invalid characters cannot be injected into the normalized address and that characters with length-increasing NFC normalizations cannot cause a local part to exceed the maximum length after normalization. +* Email addresses with internationalized local parts could, with rare Unicode characters, be returned as valid but actually be invalid in their normalized form (returned in the `normalized` field). In particular, it is possible to get a normalized address with a ";" character, which is not valid and could change the interpretation of the address. Local parts now re-validated after Unicode NFC normalization to ensure that invalid characters cannot be injected into the normalized address and that characters with length-increasing NFC normalizations cannot cause a local part to exceed the maximum length after normalization. Thanks to khanh@calif.io from https://calif.io for reporting the issue. 
* The length check for email addresses with internationalized local parts is now also applied to the original address string prior to Unicode NFC normalization, which may be longer and could exceed the maximum email address length, to protect callers who do not use the returned normalized address. * Improved error message for IDNA domains that are too long or have invalid characters after Unicode normalization. * A new option to parse `My Name <address@domain>` strings, i.e. a display name plus an email address in angle brackets, is now available. It is off by default. From 3ebcc056c8717de76037e8d307edf9040bce2eea Mon Sep 17 00:00:00 2001 From: Ben Beasley Date: Sat, 22 Jun 2024 08:35:54 -0400 Subject: [PATCH 164/174] Fix a couple of additional typos (#144) * Fix a typo in a comment (upprcase/uppercase) * Fix another typo (prefixd/prefixed) in README.md --- README.md | 2 +- email_validator/__main__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5c1af43..5d4405c 100644 --- a/README.md +++ b/README.md @@ -143,7 +143,7 @@ The `validate_email` function also accepts the following keyword arguments `allow_quoted_local=False`: Set to `True` to allow obscure and potentially problematic email addresses in which the part of the address before the @-sign contains spaces, @-signs, or other surprising characters when the local part is surrounded in quotes (so-called quoted-string local parts). In the object returned by `validate_email`, the normalized local part removes any unnecessary backslash-escaping and even removes the surrounding quotes if the address would be valid without them. You can also set `email_validator.ALLOW_QUOTED_LOCAL` to `True` to turn this on for all calls by default. -`allow_domain_literal=False`: Set to `True` to allow bracketed IPv4 and "IPv6:"-prefixd IPv6 addresses in the domain part of the email address. No deliverability checks are performed for these addresses. 
In the object returned by `validate_email`, the normalized domain will use the condensed IPv6 format, if applicable. The object's `domain_address` attribute will hold the parsed `ipaddress.IPv4Address` or `ipaddress.IPv6Address` object if applicable. You can also set `email_validator.ALLOW_DOMAIN_LITERAL` to `True` to turn this on for all calls by default. +`allow_domain_literal=False`: Set to `True` to allow bracketed IPv4 and "IPv6:"-prefixed IPv6 addresses in the domain part of the email address. No deliverability checks are performed for these addresses. In the object returned by `validate_email`, the normalized domain will use the condensed IPv6 format, if applicable. The object's `domain_address` attribute will hold the parsed `ipaddress.IPv4Address` or `ipaddress.IPv6Address` object if applicable. You can also set `email_validator.ALLOW_DOMAIN_LITERAL` to `True` to turn this on for all calls by default. `allow_display_name=False`: Set to `True` to allow a display name and bracketed address in the input string, like `My Name <me@example.org>`. It's implemented in the spirit but not the letter of RFC 5322 3.4, so it may be stricter or more relaxed than what you want. The display name, if present, is provided in the returned object's `display_name` field after being unquoted and unescaped. You can also set `email_validator.ALLOW_DISPLAY_NAME` to `True` to turn this on for all calls by default. diff --git a/email_validator/__main__.py b/email_validator/__main__.py index caa111b..4b69437 100644 --- a/email_validator/__main__.py +++ b/email_validator/__main__.py @@ -12,7 +12,7 @@ # When using STDIN, no output will be given for valid email addresses. # # Keyword arguments to validate_email can be set in environment variables -# of the same name but upprcase (see below). +# of the same name but uppercase (see below). 
import json import os From 7c22208ee5b82c377e960ddcea5293691eadc6cc Mon Sep 17 00:00:00 2001 From: Ben Beasley Date: Sat, 22 Jun 2024 08:36:30 -0400 Subject: [PATCH 165/174] Support ALLOW_DISPLAY_NAME and ALLOW_EMPTY_LOCAL in the CLI (#145) --- email_validator/__init__.py | 1 + email_validator/__main__.py | 3 ++- email_validator/validate_email.py | 6 ++++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/email_validator/__init__.py b/email_validator/__init__.py index 626aa00..726dd47 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -27,6 +27,7 @@ def caching_resolver(*args, **kwargs): # Default values for keyword arguments. ALLOW_SMTPUTF8 = True +ALLOW_EMPTY_LOCAL = False ALLOW_QUOTED_LOCAL = False ALLOW_DOMAIN_LITERAL = False ALLOW_DISPLAY_NAME = False diff --git a/email_validator/__main__.py b/email_validator/__main__.py index 4b69437..0289cf2 100644 --- a/email_validator/__main__.py +++ b/email_validator/__main__.py @@ -29,7 +29,8 @@ def main(dns_resolver: Optional[_Resolver] = None) -> None: # Set options from environment variables. 
options: Dict[str, Any] = {} - for varname in ('ALLOW_SMTPUTF8', 'ALLOW_QUOTED_LOCAL', 'ALLOW_DOMAIN_LITERAL', + for varname in ('ALLOW_SMTPUTF8', 'ALLOW_EMPTY_LOCAL', 'ALLOW_QUOTED_LOCAL', 'ALLOW_DOMAIN_LITERAL', + 'ALLOW_DISPLAY_NAME', 'GLOBALLY_DELIVERABLE', 'CHECK_DELIVERABILITY', 'TEST_ENVIRONMENT'): if varname in os.environ: options[varname.lower()] = bool(os.environ[varname]) diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index a134c77..41ed262 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -17,7 +17,7 @@ def validate_email( /, # prior arguments are positional-only *, # subsequent arguments are keyword-only allow_smtputf8: Optional[bool] = None, - allow_empty_local: bool = False, + allow_empty_local: Optional[bool] = None, allow_quoted_local: Optional[bool] = None, allow_domain_literal: Optional[bool] = None, allow_display_name: Optional[bool] = None, @@ -34,10 +34,12 @@ def validate_email( """ # Fill in default values of arguments. - from . import ALLOW_SMTPUTF8, ALLOW_QUOTED_LOCAL, ALLOW_DOMAIN_LITERAL, ALLOW_DISPLAY_NAME, \ + from . 
import ALLOW_SMTPUTF8, ALLOW_EMPTY_LOCAL, ALLOW_QUOTED_LOCAL, ALLOW_DOMAIN_LITERAL, ALLOW_DISPLAY_NAME, \ GLOBALLY_DELIVERABLE, CHECK_DELIVERABILITY, TEST_ENVIRONMENT, DEFAULT_TIMEOUT if allow_smtputf8 is None: allow_smtputf8 = ALLOW_SMTPUTF8 + if allow_empty_local is None: + allow_empty_local = ALLOW_EMPTY_LOCAL if allow_quoted_local is None: allow_quoted_local = ALLOW_QUOTED_LOCAL if allow_domain_literal is None: From dbcf07cc5c8066c14b6dc58d2dbb4a1e582eeefd Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sat, 22 Jun 2024 11:23:29 -0400 Subject: [PATCH 166/174] Change package name from using underscore to dash to match PyPi normalized package name, fixes #138 --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 3387df1..8ceac96 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -name = email_validator +name = email-validator version = attr: email_validator.version.__version__ description = A robust email address syntax and deliverability validation library. 
long_description = file: README.md From a1c90ab58fb0f5d969a8351a68ca15bff068527c Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 9 Jul 2024 06:44:03 -0400 Subject: [PATCH 167/174] Split exceptions_types.py into exceptions.py and types.py --- email_validator/__init__.py | 4 ++-- email_validator/__main__.py | 2 +- email_validator/deliverability.py | 2 +- email_validator/exceptions.py | 13 +++++++++++++ email_validator/syntax.py | 3 ++- email_validator/{exceptions_types.py => types.py} | 15 --------------- email_validator/validate_email.py | 3 ++- 7 files changed, 21 insertions(+), 21 deletions(-) create mode 100644 email_validator/exceptions.py rename email_validator/{exceptions_types.py => types.py} (92%) diff --git a/email_validator/__init__.py b/email_validator/__init__.py index 726dd47..d50a8d2 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -1,8 +1,8 @@ from typing import TYPE_CHECKING # Export the main method, helper methods, and the public data types. 
-from .exceptions_types import ValidatedEmail, EmailNotValidError, \ - EmailSyntaxError, EmailUndeliverableError +from .exceptions import EmailNotValidError, EmailSyntaxError, EmailUndeliverableError +from .types import ValidatedEmail from .validate_email import validate_email from .version import __version__ diff --git a/email_validator/__main__.py b/email_validator/__main__.py index 0289cf2..84d9fd4 100644 --- a/email_validator/__main__.py +++ b/email_validator/__main__.py @@ -21,7 +21,7 @@ from .validate_email import validate_email, _Resolver from .deliverability import caching_resolver -from .exceptions_types import EmailNotValidError +from .exceptions import EmailNotValidError def main(dns_resolver: Optional[_Resolver] = None) -> None: diff --git a/email_validator/deliverability.py b/email_validator/deliverability.py index 90f5f9a..6100a31 100644 --- a/email_validator/deliverability.py +++ b/email_validator/deliverability.py @@ -2,7 +2,7 @@ import ipaddress -from .exceptions_types import EmailUndeliverableError +from .exceptions import EmailUndeliverableError import dns.resolver import dns.exception diff --git a/email_validator/exceptions.py b/email_validator/exceptions.py new file mode 100644 index 0000000..87ef13c --- /dev/null +++ b/email_validator/exceptions.py @@ -0,0 +1,13 @@ +class EmailNotValidError(ValueError): + """Parent class of all exceptions raised by this module.""" + pass + + +class EmailSyntaxError(EmailNotValidError): + """Exception raised when an email address fails validation because of its form.""" + pass + + +class EmailUndeliverableError(EmailNotValidError): + """Exception raised when an email address fails validation because its domain name does not appear deliverable.""" + pass diff --git a/email_validator/syntax.py b/email_validator/syntax.py index f53fd5b..75837e6 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -1,4 +1,5 @@ -from .exceptions_types import EmailSyntaxError, ValidatedEmail +from .exceptions 
import EmailSyntaxError +from .types import ValidatedEmail from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \ DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_DOT_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \ DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS diff --git a/email_validator/exceptions_types.py b/email_validator/types.py similarity index 92% rename from email_validator/exceptions_types.py rename to email_validator/types.py index 928a94f..1df60ff 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/types.py @@ -2,21 +2,6 @@ from typing import Any, Dict, List, Optional, Tuple, Union -class EmailNotValidError(ValueError): - """Parent class of all exceptions raised by this module.""" - pass - - -class EmailSyntaxError(EmailNotValidError): - """Exception raised when an email address fails validation because of its form.""" - pass - - -class EmailUndeliverableError(EmailNotValidError): - """Exception raised when an email address fails validation because its domain name does not appear deliverable.""" - pass - - class ValidatedEmail: """The validate_email function returns objects of this type holding the normalized form of the email address and other information.""" diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index 41ed262..2ab237b 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -1,7 +1,8 @@ from typing import Optional, Union, TYPE_CHECKING import unicodedata -from .exceptions_types import EmailSyntaxError, ValidatedEmail +from .exceptions import EmailSyntaxError +from .types import ValidatedEmail from .syntax import split_email, validate_email_local_part, validate_email_domain_name, validate_email_domain_literal, validate_email_length from .rfc_constants import CASE_INSENSITIVE_MAILBOX_NAMES From bc08faa2a74b51a9e7ba7ff4f995c0b475cb5b12 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer 
Date: Mon, 25 Nov 2024 08:43:53 -0500 Subject: [PATCH 168/174] Add one-off error messages for full-width-at and small-commercial-at which are not accepted for the @-sign --- email_validator/syntax.py | 21 +++++++++++++++++++++ tests/test_syntax.py | 12 ++++++++++++ 2 files changed, 33 insertions(+) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 75837e6..88edbd8 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -83,7 +83,28 @@ def split_string_at_unquoted_special(text: str, specials: Tuple[str, ...]) -> Tu else: left_part += c + # No special symbol found. The special symbols always + # include an at-sign, so this always indicates a missing + # at-sign. The other symbol is optional. if len(left_part) == len(text): + # The full-width at-sign might occur in CJK contexts. + # We can't accept it because we only accept addresess + # that are actually valid. But if this is common we + # may want to consider accepting and normalizing full- + # width characters for the other special symbols (and + # full-width dot is already accepted in internationalized + # domains) with a new option. + # See https://news.ycombinator.com/item?id=42235268. + if "＠" in text: + raise EmailSyntaxError("The email address has the \"full-width\" at-sign (＠) character instead of a regular at-sign.") + + # Check another near-homoglyph for good measure because + # homoglyphs in place of required characters could be + # very confusing. We may want to consider checking for + # homoglyphs anywhere we look for a special symbol. + if "﹫" in text: + raise EmailSyntaxError('The email address has the "small commercial at" character instead of a regular at-sign.') + raise EmailSyntaxError("An email address must have an @-sign.") # The right part is whatever is left. 
diff --git a/tests/test_syntax.py b/tests/test_syntax.py index ffe4963..29df3c0 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -304,6 +304,16 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: normalized='\u1E69@nfc.tld', ), ), + ( + '＠@fullwidth.at', + MakeValidatedEmail( + local_part='＠', + smtputf8=True, + ascii_domain='fullwidth.at', + domain='fullwidth.at', + normalized='＠@fullwidth.at', + ), + ), ], ) def test_email_valid_intl_local_part(email_input: str, output: ValidatedEmail) -> None: @@ -363,6 +373,8 @@ def test_domain_literal() -> None: 'email_input,error_msg', [ ('hello.world', 'An email address must have an @-sign.'), + ('hello＠world', 'The email address has the "full-width" at-sign (＠) character instead of a regular at-sign.'), + ('hello﹫world', 'The email address has the "small commercial at" character instead of a regular at-sign.'), ('my@localhost', 'The part after the @-sign is not valid. It should have a period.'), ('my@.leadingdot.com', 'An email address cannot have a period immediately after the @-sign.'), ('my@．leadingfwdot.com', 'An email address cannot have a period immediately after the @-sign.'), From 8043de49596f08d54a07e2bc7c442ced074216a6 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 25 Nov 2024 09:50:48 -0500 Subject: [PATCH 169/174] NFC-normalize display names per UTS #39 --- email_validator/syntax.py | 4 ++-- email_validator/validate_email.py | 17 +++++++++++++++++ tests/test_syntax.py | 7 ++++--- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 88edbd8..751ce3e 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -302,8 +302,8 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp valid = "dot-atom" requires_smtputf8 = True - # There are no syntactic restrictions on quoted local parts, so if - # it was originally quoted, it is probably valid. 
More characters + # There are no dot-atom syntax restrictions on quoted local parts, so + # if it was originally quoted, it is probably valid. More characters # are allowed, like @-signs, spaces, and quotes, and there are no # restrictions on the placement of dots, as in dot-atom local parts. elif quoted_local_part: diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index 2ab237b..0e8f6e0 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -73,6 +73,14 @@ def validate_email( display_name, local_part, domain_part, is_quoted_local_part \ = split_email(email) + if display_name: + # UTS #39 3.3 Email Security Profiles for Identifiers requires + # display names (incorrectly called "quoted-string-part" there) + # to be NFC normalized. Since these are not a part of what we + # are really validating, we won't check that the input was NFC + # normalized, but we'll normalize in output. + display_name = unicodedata.normalize("NFC", display_name) + # Collect return values in this instance. ret = ValidatedEmail() ret.original = ((local_part if not is_quoted_local_part @@ -95,6 +103,15 @@ def validate_email( # RFC 6532 section 3.1 says that Unicode NFC normalization should be applied, # so we'll return the NFC-normalized local part. Since the caller may use that # string in place of the original string, ensure it is also valid. + # + # UTS #39 3.3 Email Security Profiles for Identifiers requires local parts + # to be NFKC normalized, which loses some information in characters that can + # be decomposed. We might want to consider applying NFKC normalization, but + # we can't make the change easily because it would break database lookups + # for any caller that put a normalized address from a previous version of + # this library. 
(UTS #39 seems to require that the *input* be NKFC normalized + # and has other requirements that are hard to check without additional Unicode + # data, and I don't know whether the rules really apply in the wild.) normalized_local_part = unicodedata.normalize("NFC", ret.local_part) if normalized_local_part != ret.local_part: try: diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 29df3c0..853cc5e 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -295,13 +295,14 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ), ), ( - 's\u0323\u0307@nfc.tld', + '\"s\u0323\u0307\" <s\u0323\u0307@nfc.tld>', MakeValidatedEmail( local_part='\u1E69', smtputf8=True, ascii_domain='nfc.tld', domain='nfc.tld', normalized='\u1E69@nfc.tld', + display_name='\u1E69' ), ), ( @@ -318,11 +319,11 @@ def test_email_valid(email_input: str, output: ValidatedEmail) -> None: ) def test_email_valid_intl_local_part(email_input: str, output: ValidatedEmail) -> None: # Check that it passes when allow_smtputf8 is True. - assert validate_email(email_input, check_deliverability=False) == output + assert validate_email(email_input, check_deliverability=False, allow_display_name=True) == output # Check that it fails when allow_smtputf8 is False. 
with pytest.raises(EmailSyntaxError) as exc_info: - validate_email(email_input, allow_smtputf8=False, check_deliverability=False) + validate_email(email_input, allow_smtputf8=False, check_deliverability=False, allow_display_name=True) assert "Internationalized characters before the @-sign are not supported: " in str(exc_info.value) From 936aead3bf5c608f8561954e0d2955b7f97bfdad Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 26 Nov 2024 12:01:31 -0500 Subject: [PATCH 170/174] Fix final syntax checks on normalized internationalized domains checking the wrong variable, but these checks had no tests and no known way to produce a fail --- email_validator/syntax.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 751ce3e..97eee7a 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -629,12 +629,12 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob # case for this. bad_chars = { safe_character_display(c) - for c in domain + for c in domain_i18n if not ATEXT_HOSTNAME_INTL.match(c) } if bad_chars: raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".") - check_unsafe_chars(domain) + check_unsafe_chars(domain_i18n) # Check that it can be encoded back to IDNA ASCII. We have no test # case for this. 
From 98800bac023b8713351393a5043034065f1ea6cb Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Wed, 27 Nov 2024 21:11:20 -0500 Subject: [PATCH 171/174] Add explicit checks for internationalized domain name characters invalid under UTS-46 to improve the error message --- email_validator/syntax.py | 39 +++++++++++++++++++++++++++++++++++++++ tests/test_syntax.py | 4 +--- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 97eee7a..1e34f49 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -460,6 +460,36 @@ def check_dot_atom(label: str, start_descr: str, end_descr: str, is_hostname: bo raise EmailSyntaxError("An email address cannot have a period and a hyphen next to each other.") +def uts46_valid_char(char: str) -> bool: + # By exhaustively searching for characters rejected by + # for c in (chr(i) for i in range(0x110000)): + # idna.uts46_remap(c, std3_rules=False, transitional=False) + # I found the following rules are pretty close. + c = ord(char) + if 0x80 <= c <= 0x9f: + # 8-bit ASCII range. + return False + elif ((0x2010 <= c <= 0x2060 and not (0x2024 <= c <= 0x2026) and not (0x2028 <= c <= 0x202E)) + or c in (0x00AD, 0x2064, 0xFF0E) + or 0x200B <= c <= 0x200D + or 0x1BCA0 <= c <= 0x1BCA3): + # Characters that are permitted but fall into one of the + # tests below. + return True + elif unicodedata.category(chr(c)) in ("Cf", "Cn", "Co", "Cs", "Zs", "Zl", "Zp"): + # There are a bunch of Zs characters including regular space + # that are allowed by UTS46 but are not allowed in domain + # names anyway. + # + # There are some Cn (unassigned) characters that the idna + # package doesn't reject but we can, I think. + return False + elif "002E" in unicodedata.decomposition(chr(c)).split(" "): + # Characters that decompose into a sequence with a dot. 
+ return False + return True + + class DomainNameValidationResult(TypedDict): ascii_domain: str domain: str @@ -484,6 +514,15 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob # they may not be valid, safe, or sensible Unicode strings. check_unsafe_chars(domain) + # Reject characters that would be rejected by UTS-46 normalization next but + # with an error message under our control. + bad_chars = { + safe_character_display(c) for c in domain + if not uts46_valid_char(c) + } + if bad_chars: + raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".") + # Perform UTS-46 normalization, which includes casefolding, NFC normalization, # and converting all label separators (the period/full stop, fullwidth full stop, # ideographic full stop, and halfwidth ideographic full stop) to regular dots. diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 853cc5e..74ed2f3 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -402,9 +402,7 @@ def test_domain_literal() -> None: ('.leadingdot@domain.com', 'An email address cannot start with a period.'), ('twodots..here@domain.com', 'An email address cannot have two periods in a row.'), ('trailingdot.@domain.email', 'An email address cannot have a period immediately before the @-sign.'), - ('me@⒈wouldbeinvalid.com', - "The part after the @-sign contains invalid characters (Codepoint U+2488 not allowed " - "at position 1 in '⒈wouldbeinvalid.com')."), + ('me@⒈wouldbeinvalid.com', "The part after the @-sign contains invalid characters: '⒈'."), ('me@\u037e.com', "The part after the @-sign contains invalid characters after Unicode normalization: ';'."), ('me@\u1fef.com', "The part after the @-sign contains invalid characters after Unicode normalization: '`'."), ('@example.com', 'There must be something before the @-sign.'), From f90d256045dc1ccbcffd5514189267d14a9e3ea1 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 25 
Aug 2025 10:28:08 -0400 Subject: [PATCH 172/174] Remove local part length check unless new strict flag is given, fixes #158 --- README.md | 1 + email_validator/__init__.py | 1 + email_validator/rfc_constants.py | 16 ++++++++++++++-- email_validator/syntax.py | 4 ++-- email_validator/validate_email.py | 11 ++++++++--- tests/test_syntax.py | 24 +++++++++++++++++++++--- 6 files changed, 47 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 5d4405c..0d1f0eb 100644 --- a/README.md +++ b/README.md @@ -150,6 +150,7 @@ The `validate_email` function also accepts the following keyword arguments `allow_empty_local=False`: Set to `True` to allow an empty local part (i.e. `@example.com`), e.g. for validating Postfix aliases. +`strict=False`: Set to `True` to perform additional syntax checks (currently only a local part length check). This should be used by mail service providers at address creation to ensure email addresses meet broad compatibility requirements. ### DNS timeout and cache diff --git a/email_validator/__init__.py b/email_validator/__init__.py index d50a8d2..38d0741 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -31,6 +31,7 @@ def caching_resolver(*args, **kwargs): ALLOW_QUOTED_LOCAL = False ALLOW_DOMAIN_LITERAL = False ALLOW_DISPLAY_NAME = False +STRICT = False GLOBALLY_DELIVERABLE = True CHECK_DELIVERABILITY = True TEST_ENVIRONMENT = False diff --git a/email_validator/rfc_constants.py b/email_validator/rfc_constants.py index 39d8e31..e93441b 100644 --- a/email_validator/rfc_constants.py +++ b/email_validator/rfc_constants.py @@ -36,12 +36,24 @@ QTEXT_INTL = re.compile(r"[\u0020-\u007E\u0080-\U0010FFFF]") # Length constants + # RFC 3696 + errata 1003 + errata 1690 (https://www.rfc-editor.org/errata_search.php?rfc=3696&eid=1690) -# explains the maximum length of an email address is 254 octets. +# explains the maximum length of an email address is 254 octets based on RFC 5321 4.5.3.1.3. 
A +# maximum local part length is also given at RFC 5321 4.5.3.1.1. +# +# But RFC 5321 4.5.3.1 says that these (and other) limits are in a sense suggestions, and longer +# local parts have been seen in the wild. Consequntely, the local part length is only checked +# in "strict" mode. Although the email address maximum length is also somewhat of a suggestion, +# I don't like the idea of having no length checks performed, so I'm leaving that to always be +# checked. EMAIL_MAX_LENGTH = 254 LOCAL_PART_MAX_LENGTH = 64 + +# Although RFC 5321 4.5.3.1.2 gives a (suggested, see above) limit of 255 octets, RFC 1035 2.3.4 also +# imposes a length limit (255 octets). But per https://stackoverflow.com/questions/32290167/what-is-the-maximum-length-of-a-dns-name, +# two of those octets are taken up by the optional final dot and null root label. DNS_LABEL_LENGTH_LIMIT = 63 # in "octets", RFC 1035 2.3.1 -DOMAIN_MAX_LENGTH = 253 # in "octets" as transmitted, RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2, and see https://stackoverflow.com/questions/32290167/what-is-the-maximum-length-of-a-dns-name +DOMAIN_MAX_LENGTH = 253 # in "octets" as transmitted # RFC 2142 CASE_INSENSITIVE_MAILBOX_NAMES = [ diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 1e34f49..0b1c7b0 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -229,7 +229,7 @@ class LocalPartValidationResult(TypedDict): def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_empty_local: bool = False, - quoted_local_part: bool = False) -> LocalPartValidationResult: + quoted_local_part: bool = False, strict: bool = False) -> LocalPartValidationResult: """Validates the syntax of the local part of an email address.""" if len(local) == 0: @@ -251,7 +251,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp # internationalized, then the UTF-8 encoding may be longer, but # that may not be relevant. We will check the total address length # instead. 
- if len(local) > LOCAL_PART_MAX_LENGTH: + if strict and len(local) > LOCAL_PART_MAX_LENGTH: reason = get_length_reason(local, limit=LOCAL_PART_MAX_LENGTH) raise EmailSyntaxError(f"The email address is too long before the @-sign {reason}.") diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index 0e8f6e0..c13ceee 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -22,6 +22,7 @@ def validate_email( allow_quoted_local: Optional[bool] = None, allow_domain_literal: Optional[bool] = None, allow_display_name: Optional[bool] = None, + strict: Optional[bool] = None, check_deliverability: Optional[bool] = None, test_environment: Optional[bool] = None, globally_deliverable: Optional[bool] = None, @@ -36,7 +37,7 @@ def validate_email( # Fill in default values of arguments. from . import ALLOW_SMTPUTF8, ALLOW_EMPTY_LOCAL, ALLOW_QUOTED_LOCAL, ALLOW_DOMAIN_LITERAL, ALLOW_DISPLAY_NAME, \ - GLOBALLY_DELIVERABLE, CHECK_DELIVERABILITY, TEST_ENVIRONMENT, DEFAULT_TIMEOUT + STRICT, GLOBALLY_DELIVERABLE, CHECK_DELIVERABILITY, TEST_ENVIRONMENT, DEFAULT_TIMEOUT if allow_smtputf8 is None: allow_smtputf8 = ALLOW_SMTPUTF8 if allow_empty_local is None: @@ -47,6 +48,8 @@ def validate_email( allow_domain_literal = ALLOW_DOMAIN_LITERAL if allow_display_name is None: allow_display_name = ALLOW_DISPLAY_NAME + if strict is None: + strict = STRICT if check_deliverability is None: check_deliverability = CHECK_DELIVERABILITY if test_environment is None: @@ -95,7 +98,8 @@ def validate_email( local_part_info = validate_email_local_part(local_part, allow_smtputf8=allow_smtputf8, allow_empty_local=allow_empty_local, - quoted_local_part=is_quoted_local_part) + quoted_local_part=is_quoted_local_part, + strict=strict) ret.local_part = local_part_info["local_part"] ret.ascii_local_part = local_part_info["ascii_local_part"] ret.smtputf8 = local_part_info["smtputf8"] @@ -118,7 +122,8 @@ def validate_email( 
validate_email_local_part(normalized_local_part, allow_smtputf8=allow_smtputf8, allow_empty_local=allow_empty_local, - quoted_local_part=is_quoted_local_part) + quoted_local_part=is_quoted_local_part, + strict=strict) except EmailSyntaxError as e: raise EmailSyntaxError("After Unicode normalization: " + str(e)) from e ret.local_part = normalized_local_part diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 74ed2f3..9bd1385 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -415,9 +415,6 @@ def test_domain_literal() -> None: ('test@\n', 'The part after the @-sign contains invalid characters: U+000A.'), ('bad"quotes"@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), ('obsolete."quoted".atom@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), - ('11111111112222222222333333333344444444445555555555666666666677777@example.com', 'The email address is too long before the @-sign (1 character too many).'), - ('111111111122222222223333333333444444444455555555556666666666777777@example.com', 'The email address is too long before the @-sign (2 characters too many).'), - ('\uFB2C111111122222222223333333333444444444455555555556666666666777777@example.com', 'After Unicode normalization: The email address is too long before the @-sign (2 characters too many).'), ('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444444445555555555.com', 'The email address is too long after the @-sign (1 character too many).'), ('me@中1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444.com', 'The email address is too long after the 
@-sign (1 byte too many after IDNA encoding).'), ('me@\uFB2C1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444.com', 'The email address is too long after the @-sign (5 bytes too many after IDNA encoding).'), @@ -467,6 +464,22 @@ def test_email_invalid_syntax(email_input: str, error_msg: str) -> None: assert str(exc_info.value) == error_msg +@pytest.mark.parametrize( + 'email_input,error_msg', + [ + ('11111111112222222222333333333344444444445555555555666666666677777@example.com', 'The email address is too long before the @-sign (1 character too many).'), + ('111111111122222222223333333333444444444455555555556666666666777777@example.com', 'The email address is too long before the @-sign (2 characters too many).'), + ('\uFB2C111111122222222223333333333444444444455555555556666666666777777@example.com', 'After Unicode normalization: The email address is too long before the @-sign (2 characters too many).'), + ]) +def test_email_invalid_syntax_strict(email_input: str, error_msg: str) -> None: + # Since these all have syntax errors, deliverability + # checks do not arise. + validate_email(email_input, check_deliverability=False) # pass without strict + with pytest.raises(EmailSyntaxError) as exc_info: + validate_email(email_input, strict=True, check_deliverability=False) + assert str(exc_info.value) == error_msg + + @pytest.mark.parametrize( 'email_input', [ @@ -728,6 +741,11 @@ def test_pyisemail_tests(email_input: str, status: str) -> None: validate_email(email_input, test_environment=True) validate_email(email_input, allow_quoted_local=True, allow_domain_literal=True, test_environment=True) + elif status == "ISEMAIL_RFC5322_LOCAL_TOOLONG": + # Requires strict. 
+ with pytest.raises(EmailSyntaxError): + validate_email(email_input, strict=True, test_environment=True) + elif status == "ISEMAIL_RFC5321_QUOTEDSTRING": # Quoted-literal local parts are only valid with an option. with pytest.raises(EmailSyntaxError): From e943a0f07f5c130b4a419e0cd79f705f36bf24fe Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 25 Aug 2025 10:37:39 -0400 Subject: [PATCH 173/174] Raise TypeError when an invalid argument is passed for email, closes #155 --- email_validator/validate_email.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index c13ceee..ae5d963 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -59,14 +59,18 @@ def validate_email( if timeout is None and dns_resolver is None: timeout = DEFAULT_TIMEOUT - # Allow email to be a str or bytes instance. If bytes, - # it must be ASCII because that's how the bytes work - # on the wire with SMTP. - if not isinstance(email, str): + if isinstance(email, str): + pass + elif isinstance(email, bytes): + # Allow email to be a bytes instance as if it is what + # will be transmitted on the wire. But assume SMTPUTF8 + # is unavailable, so it must be ASCII. try: email = email.decode("ascii") except ValueError as e: raise EmailSyntaxError("The email address is not valid ASCII.") from e + else: + raise TypeError("email must be str or bytes") # Split the address into the display name (or None), the local part # (before the @-sign), and the domain part (after the @-sign). 
From 030a63a183a6a66450e98381ca9a23ab9769706a Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 26 Aug 2025 09:03:08 -0400 Subject: [PATCH 174/174] Version 2.3.0 --- CHANGELOG.md | 9 +++++++++ email_validator/version.py | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b2d3c55..4874e9d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +2.3.0 (August 26, 2025) +----------------------- + +* The package name is changed from using an underscore (email_validator) to a dash (email-validator) to match PyPI's normalized package name. +* The library no longer checks that the local part is at most 64 characters because a more careful reading of RFC 5321 indicates the limit is optional and such email addresses have been found in the wild. However the check can be restored using a new `strict=True` parameter, and the overall 254 character email address length limit is still in place. +* New EmailSyntaxError messages are used for some existing syntax errors related to @-sign homoglyphs and invalid characters in internationalized domains. +* When using `allow_display_name=True`, display names are now returned with Unicode NFC normalization. +* TypeError is now raised if something other than str (or bytes) is passed as the email address. + 2.2.0 (June 20, 2024) --------------------- diff --git a/email_validator/version.py b/email_validator/version.py index 8a124bf..55e4709 100644 --- a/email_validator/version.py +++ b/email_validator/version.py @@ -1 +1 @@ -__version__ = "2.2.0" +__version__ = "2.3.0"