diff --git a/email_validator/deliverability.py b/email_validator/deliverability.py index 6100a31..ff1e817 100644 --- a/email_validator/deliverability.py +++ b/email_validator/deliverability.py @@ -2,7 +2,10 @@ import ipaddress -from .exceptions import EmailUndeliverableError +from .exceptions import (EmailUndeliverableError, + EmailUndeliverableNullMxError, EmailUndeliverableNoMxError, + EmailUndeliverableFallbackDeniesSendingMailError, + EmailUndeliverableNoDomainError, EmailUndeliverableOtherError) import dns.resolver import dns.exception @@ -60,7 +63,7 @@ def validate_email_deliverability(domain: str, domain_i18n: str, timeout: Option mtas = [(preference, exchange) for preference, exchange in mtas if exchange != ""] if len(mtas) == 0: # null MX only, if there were no MX records originally a NoAnswer exception would have occurred - raise EmailUndeliverableError(f"The domain name {domain_i18n} does not accept email.") + raise EmailUndeliverableNullMxError(domain_i18n) deliverability_info["mx"] = mtas deliverability_info["mx_fallback_type"] = None @@ -110,7 +113,7 @@ def is_global_addr(address: Any) -> bool: # this domain is not deliverable, although the domain # name has other records (otherwise NXDOMAIN would # have been raised). - raise EmailUndeliverableError(f"The domain name {domain_i18n} does not accept email.") from e + raise EmailUndeliverableNoMxError(domain_i18n) from e # Check for a SPF (RFC 7208) reject-all record ("v=spf1 -all") which indicates # no emails are sent from this domain (similar to a Null MX record @@ -123,7 +126,7 @@ def is_global_addr(address: Any) -> bool: value = b"".join(rec.strings) if value.startswith(b"v=spf1 "): if value == b"v=spf1 -all": - raise EmailUndeliverableError(f"The domain name {domain_i18n} does not send email.") + raise EmailUndeliverableFallbackDeniesSendingMailError(domain_i18n) except dns.resolver.NoAnswer: # No TXT records means there is no SPF policy, so we cannot take any action. 
pass @@ -131,7 +134,7 @@ def is_global_addr(address: Any) -> bool: except dns.resolver.NXDOMAIN as e: # The domain name does not exist --- there are no records of any sort # for the domain name. - raise EmailUndeliverableError(f"The domain name {domain_i18n} does not exist.") from e + raise EmailUndeliverableNoDomainError(domain_i18n) from e except dns.resolver.NoNameservers: # All nameservers failed to answer the query. This might be a problem @@ -152,8 +155,6 @@ def is_global_addr(address: Any) -> bool: except Exception as e: # Unhandled conditions should not propagate. - raise EmailUndeliverableError( - "There was an error while checking if the domain name in the email address is deliverable: " + str(e) - ) from e + raise EmailUndeliverableOtherError(domain_i18n) from e return deliverability_info diff --git a/email_validator/exceptions.py b/email_validator/exceptions.py index 87ef13c..b63b5c1 100644 --- a/email_validator/exceptions.py +++ b/email_validator/exceptions.py @@ -1,3 +1,7 @@ +from dataclasses import dataclass +import unicodedata + + class EmailNotValidError(ValueError): """Parent class of all exceptions raised by this module.""" pass @@ -8,6 +12,153 @@ class EmailSyntaxError(EmailNotValidError): pass +class EmailSyntaxNoAtSignError(EmailSyntaxError): + """Exception raised when an email address is missing an @-sign.""" + def __str__(self): + return "An email address must have an @-sign." + + +@dataclass +class EmailSyntaxAtSignConfusedError(EmailSyntaxNoAtSignError): + """Exception raised when an email address is missing an @-sign but a confusable character is present.""" + character: str + def __str__(self): + return f"The email address has the {self.character} character instead of a regular at-sign." + + +def safe_character_display(c: str) -> str: + # Return safely displayable characters in quotes. 
+ if c == '\\': + return f"\"{c}\"" # can't use repr because it escapes it + if unicodedata.category(c)[0] in ("L", "N", "P", "S"): + return repr(c) + + # Construct a hex string in case the unicode name doesn't exist. + if ord(c) < 0xFFFF: + h = f"U+{ord(c):04x}".upper() + else: + h = f"U+{ord(c):08x}".upper() + + # Return the character name or, if it has no name, the hex string. + return unicodedata.name(c, h) + + +@dataclass +class EmailInvalidCharactersError(EmailSyntaxError): + """Exception raised when an email address fails validation because it contains invalid characters.""" + characters: list[str] + def __str__(self): + return ", ".join(safe_character_display(c) for c in self.characters) + + +class EmailInvalidCharactersAfterQuotedString(EmailInvalidCharactersError): + """Exception raised when an email address fails validation because it contains invalid characters after a quoted string.""" + def __str__(self): + return "Extra character(s) found after close quote: " + EmailInvalidCharactersError.__str__(self) + "." + + +class EmailInvalidCharactersInUnquotedDisplayName(EmailInvalidCharactersError): + """Exception raised when an email address fails validation because it contains invalid characters after a quoted string.""" + def __str__(self): + return "The display name contains invalid characters when not quoted: " + EmailInvalidCharactersError.__str__(self) + "." + + +class EmailIntlCharactersInLocalPart(EmailInvalidCharactersError): + """Exception raised when an email address fails validation because it contains invalid characters after a quoted string.""" + def __str__(self): + return "Internationalized characters before the @-sign are not supported: " + EmailInvalidCharactersError.__str__(self) + "." 
+
+
+class EmailInvalidCharactersInLocalPart(EmailInvalidCharactersError):
+    """Exception raised when an email address fails validation because it contains invalid characters in the local part (before the @-sign)."""
+    def __str__(self):
+        return "The email address contains invalid characters before the @-sign: " + EmailInvalidCharactersError.__str__(self) + "."
+
+
+class EmailUnsafeCharactersError(EmailInvalidCharactersError):
+    """Exception raised when an email address fails validation because it contains characters rejected as unsafe."""
+    def __str__(self):
+        return "The email address contains unsafe characters: " + EmailInvalidCharactersError.__str__(self) + "."
+
+
+class EmailInvalidCharactersInDomainPart(EmailInvalidCharactersError):
+    """Exception raised when an email address fails validation because the part after the @-sign contains invalid characters."""
+    def __str__(self):
+        return "The part after the @-sign contains invalid characters: " + EmailInvalidCharactersError.__str__(self) + "."
+
+
+class EmailInvalidCharactersInDomainPartAfterUnicodeNormalization(EmailInvalidCharactersError):
+    """Exception raised when an email address fails validation because the part after the @-sign contains invalid characters once Unicode normalization has been applied."""
+    def __str__(self):
+        return "The part after the @-sign contains invalid characters after Unicode normalization: " + EmailInvalidCharactersError.__str__(self) + "."
+
+
+class EmailInvalidCharactersInDomainAddressLiteral(EmailInvalidCharactersError):
+    """Exception raised when an email address fails validation because a bracketed address literal after the @-sign contains invalid characters."""
+    def __str__(self):
+        return "The part after the @-sign contains invalid characters in brackets: " + EmailInvalidCharactersError.__str__(self) + "."
+
+
+class EmailBracketedAddressMissingCloseBracket(EmailSyntaxError):
+    """Exception raised when an email address begins with an angle bracket but does not end with an angle bracket."""
+    def __str__(self):
+        return "An open angle bracket at the start of the email address has to be followed by a close angle bracket at the end."
+
+
+class EmailBracketedAddressExtraneousText(EmailSyntaxError):
+    """Exception raised when an email address in angle brackets has text after the angle brackets."""
+    def __str__(self):
+        return "There can't be anything after the email address."
+
+
+class EmailNoLocalPartError(EmailSyntaxError):
+    """Exception raised when an email address has nothing before the @-sign (an empty local part)."""
+    def __str__(self):
+        return "There must be something before the @-sign."
+
+
+@dataclass
+class EmailUnhandledSyntaxError(EmailSyntaxError):
+    """Exception raised when an email address has an unhandled error."""
+    message: str
+    def __str__(self):
+        return self.message
+
+
+@dataclass
 class EmailUndeliverableError(EmailNotValidError):
     """Exception raised when an email address fails validation because its domain name does not appear deliverable."""
-    pass
+    domain: str
+
+
+@dataclass
+class EmailUndeliverableNullMxError(EmailUndeliverableError):
+    """Exception raised when an email address fails validation because its domain name has a Null MX record indicating that it cannot receive mail."""
+    # See https://www.rfc-editor.org/rfc/rfc7505.
+    def __str__(self):
+        return f"The domain name {self.domain} does not accept email."
+
+@dataclass
+class EmailUndeliverableNoMxError(EmailUndeliverableError):
+    """Exception raised when an email address fails validation because its domain name has no MX, A, or AAAA record indicating how to deliver mail."""
+    def __str__(self):
+        return f"The domain name {self.domain} does not accept email."
+
+@dataclass
+class EmailUndeliverableFallbackDeniesSendingMailError(EmailUndeliverableError):
+    """Exception raised when an email address fails validation because its domain name has no MX record and it has a SPF record indicating it does not send mail."""
+    def __str__(self):
+        return f"The domain name {self.domain} does not send email."
+
+@dataclass
+class EmailUndeliverableNoDomainError(EmailUndeliverableError):
+    """Exception raised when an email address fails validation because its domain name does not exist in DNS."""
+    def __str__(self):
+        return f"The domain name {self.domain} does not exist."
+
+
+@dataclass
+class EmailUndeliverableOtherError(EmailUndeliverableError):
+    """Exception raised when the deliverability check fails with an unhandled exception; it derives from EmailUndeliverableError so it accepts the domain argument it is raised with and is still caught by existing handlers."""
+    def __str__(self):
+        return "There was an error while checking if the domain name in the email address is deliverable."
diff --git a/email_validator/syntax.py b/email_validator/syntax.py
index 97eee7a..b038d87 100644
--- a/email_validator/syntax.py
+++ b/email_validator/syntax.py
@@ -1,4 +1,13 @@
-from .exceptions import EmailSyntaxError
+from .exceptions import (EmailSyntaxError,
+                         EmailSyntaxNoAtSignError, EmailSyntaxAtSignConfusedError,
+                         EmailInvalidCharactersAfterQuotedString,
+                         EmailInvalidCharactersInUnquotedDisplayName,
+                         EmailIntlCharactersInLocalPart, EmailInvalidCharactersInLocalPart,
+                         EmailUnsafeCharactersError,
+                         EmailNoLocalPartError, EmailUnhandledSyntaxError,
+                         EmailBracketedAddressExtraneousText, EmailBracketedAddressMissingCloseBracket,
+                         EmailInvalidCharactersInDomainPart, EmailInvalidCharactersInDomainPartAfterUnicodeNormalization,
+                         EmailInvalidCharactersInDomainAddressLiteral)
 from .types import ValidatedEmail
 from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \
     DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_DOT_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \
@@ -96,16 +105,16 @@ def split_string_at_unquoted_special(text: str, specials: Tuple[str, ...]) -> Tu #
domains) with a new option. # See https://news.ycombinator.com/item?id=42235268. if "@" in text: - raise EmailSyntaxError("The email address has the \"full-width\" at-sign (@) character instead of a regular at-sign.") + raise EmailSyntaxAtSignConfusedError('"full-width" at-sign (@)') # Check another near-homoglyph for good measure because # homoglyphs in place of required characters could be # very confusing. We may want to consider checking for # homoglyphs anywhere we look for a special symbol. if "﹫" in text: - raise EmailSyntaxError('The email address has the "small commercial at" character instead of a regular at-sign.') + raise EmailSyntaxAtSignConfusedError('"small commercial at"') - raise EmailSyntaxError("An email address must have an @-sign.") + raise EmailSyntaxNoAtSignError() # The right part is whatever is left. right_part = text[len(left_part):] @@ -129,8 +138,7 @@ def unquote_quoted_string(text: str) -> Tuple[str, bool]: escaped = True elif c == '"': if i != len(text) - 1: - raise EmailSyntaxError("Extra character(s) found after close quote: " - + ", ".join(safe_character_display(c) for c in text[i + 1:])) + raise EmailInvalidCharactersAfterQuotedString(list(text[i + 1:])) break else: value += c @@ -158,13 +166,7 @@ def unquote_quoted_string(text: str) -> Tuple[str, bool]: # Check that only basic characters are present in a # non-quoted display name. if not display_name_quoted: - bad_chars = { - safe_character_display(c) - for c in display_name - if (not ATEXT_RE.match(c) and c != ' ') or c == '.' - } - if bad_chars: - raise EmailSyntaxError("The display name contains invalid characters when not quoted: " + ", ".join(sorted(bad_chars)) + ".") + check_display_name_unquoted(display_name) # Check for other unsafe characters. check_unsafe_chars(display_name, allow_space=True) @@ -172,10 +174,10 @@ def unquote_quoted_string(text: str) -> Tuple[str, bool]: # Check that the right part ends with an angle bracket # but allow spaces after it, I guess. 
if ">" not in right_part: - raise EmailSyntaxError("An open angle bracket at the start of the email address has to be followed by a close angle bracket at the end.") + raise EmailBracketedAddressMissingCloseBracket() right_part = right_part.rstrip(" ") if right_part[-1] != ">": - raise EmailSyntaxError("There can't be anything after the email address.") + raise EmailBracketedAddressExtraneousText() # Remove the initial and trailing angle brackets. addr_spec = right_part[1:].rstrip(">") @@ -198,6 +200,15 @@ def unquote_quoted_string(text: str) -> Tuple[str, bool]: return display_name, local_part, domain_part, is_quoted_local_part +def check_display_name_unquoted(display_name): + bad_chars = sorted({ + c for c in display_name + if (not ATEXT_RE.match(c) and c != ' ') or c == '.' + }) + if bad_chars: + raise EmailInvalidCharactersInUnquotedDisplayName(bad_chars) + + def get_length_reason(addr: str, limit: int) -> str: """Helper function to return an error message related to invalid length.""" diff = len(addr) - limit @@ -205,23 +216,6 @@ def get_length_reason(addr: str, limit: int) -> str: return f"({diff} character{suffix} too many)" -def safe_character_display(c: str) -> str: - # Return safely displayable characters in quotes. - if c == '\\': - return f"\"{c}\"" # can't use repr because it escapes it - if unicodedata.category(c)[0] in ("L", "N", "P", "S"): - return repr(c) - - # Construct a hex string in case the unicode name doesn't exist. - if ord(c) < 0xFFFF: - h = f"U+{ord(c):04x}".upper() - else: - h = f"U+{ord(c):08x}".upper() - - # Return the character name or, if it has no name, the hex string. 
- return unicodedata.name(c, h) - - class LocalPartValidationResult(TypedDict): local_part: str ascii_local_part: Optional[str] @@ -234,7 +228,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp if len(local) == 0: if not allow_empty_local: - raise EmailSyntaxError("There must be something before the @-sign.") + raise EmailNoLocalPartError() # The caller allows an empty local part. Useful for validating certain # Postfix aliases. @@ -287,16 +281,15 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp # Check for invalid characters against the non-internationalized # permitted character set. # (RFC 5322 3.2.3) - bad_chars = { - safe_character_display(c) - for c in local + bad_chars = sorted({ + c for c in local if not ATEXT_RE.match(c) - } + }) if bad_chars: - raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(sorted(bad_chars)) + ".") + raise EmailIntlCharactersInLocalPart(bad_chars) # Although the check above should always find something, fall back to this just in case. - raise EmailSyntaxError("Internationalized characters before the @-sign are not supported.") + raise EmailUnhandledSyntaxError("Internationalized characters before the @-sign are not supported.") # It's valid. valid = "dot-atom" @@ -311,18 +304,16 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp # (RFC 5321 4.1.2. RFC 5322 lists additional permitted *obsolete* # characters which are *not* allowed here. RFC 6531 section 3.3 # extends the range to UTF8 strings.) 
-        bad_chars = {
-            safe_character_display(c)
-            for c in local
+        bad_chars = sorted({
+            c for c in local
             if not QTEXT_INTL.match(c)
-        }
+        })
         if bad_chars:
-            raise EmailSyntaxError("The email address contains invalid characters in quotes before the @-sign: " + ", ".join(sorted(bad_chars)) + ".")
+            raise EmailInvalidCharactersInLocalPart(bad_chars)
 
         # See if any characters are outside of the ASCII range.
         bad_chars = {
-            safe_character_display(c)
-            for c in local
+            c for c in local
             if not (32 <= ord(c) <= 126)
         }
         if bad_chars:
@@ -330,7 +321,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp
 
             # International characters in the local part may not be permitted.
             if not allow_smtputf8:
-                raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(sorted(bad_chars)) + ".")
+                raise EmailIntlCharactersInLocalPart(sorted(bad_chars))
 
         # It's valid.
         valid = "quoted"
@@ -351,7 +342,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp
     try:
         local.encode("utf8")
     except ValueError as e:
-        raise EmailSyntaxError("The email address contains an invalid character.") from e
+        raise EmailUnhandledSyntaxError("The email address contains an invalid character.") from e
 
     # If this address passes only by the quoted string form, re-quote it
     # and backslash-escape quotes and backslashes (removing any unnecessary
@@ -373,12 +364,11 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp
     # Check for invalid characters.
     # (RFC 5322 3.2.3, plus RFC 6531 3.3)
     bad_chars = {
-        safe_character_display(c)
-        for c in local
+        c for c in local
         if not ATEXT_INTL_DOT_RE.match(c)
     }
     if bad_chars:
-        raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".")
+        raise EmailInvalidCharactersInLocalPart(sorted(bad_chars))
 
     # Check for dot errors imposted by the dot-atom rule.
# (RFC 5322 3.2.3) @@ -386,7 +376,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp # All of the reasons should already have been checked, but just in case # we have a fallback message. - raise EmailSyntaxError("The email address contains invalid characters before the @-sign.") + raise EmailUnhandledSyntaxError("The email address contains invalid characters before the @-sign.") def check_unsafe_chars(s: str, allow_space: bool = False) -> None: @@ -437,8 +427,7 @@ def check_unsafe_chars(s: str, allow_space: bool = False) -> None: # to the Unicode specification in the future, reject all other categories. bad_chars.add(c) if bad_chars: - raise EmailSyntaxError("The email address contains unsafe characters: " - + ", ".join(safe_character_display(c) for c in sorted(bad_chars)) + ".") + raise EmailUnsafeCharactersError(sorted(bad_chars)) def check_dot_atom(label: str, start_descr: str, end_descr: str, is_hostname: bool) -> None: @@ -460,6 +449,36 @@ def check_dot_atom(label: str, start_descr: str, end_descr: str, is_hostname: bo raise EmailSyntaxError("An email address cannot have a period and a hyphen next to each other.") +def uts46_valid_char(c): + # By exhaustively searching for characters rejected by + # for c in (chr(i) for i in range(0x110000)): + # idna.uts46_remap(c, std3_rules=False, transitional=False) + # I found the following rules are pretty close. + c = ord(c) + if 0x80 <= c <= 0x9f: + # 8-bit ASCII range. + return False + elif (0x2010 <= c <= 0x2060 and not (0x2024 <= c <= 0x2026) and not (0x2028 <= c <= 0x202E)) \ + or c in (0x00AD, 0x2064, 0xFF0E) \ + or 0x200B <= c <= 0x200D \ + or 0x1BCA0 <= c <= 0x1BCA3: + # Characters that are permitted but fall into one of the + # tests below. + return True + elif unicodedata.category(chr(c)) in ("Cf", "Cn", "Co", "Cs", "Zs", "Zl", "Zp"): + # There are a bunch of Zs characters including regular space + # that are allowed by UTS46 but are not allowed in domain + # names anyway. 
+ # + # There are some Cn (unassigned) characters that the idna + # package doesn't reject but we can, I think. + return False + elif "002E" in unicodedata.decomposition(chr(c)).split(" "): + # Characters that decompose into a sequence with a dot. + return False + return True + + class DomainNameValidationResult(TypedDict): ascii_domain: str domain: str @@ -470,13 +489,12 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob # Check for invalid characters. # (RFC 952 plus RFC 6531 section 3.3 for internationalized addresses) - bad_chars = { - safe_character_display(c) - for c in domain + bad_chars = sorted({ + c for c in domain if not ATEXT_HOSTNAME_INTL.match(c) - } + }) if bad_chars: - raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".") + raise EmailInvalidCharactersInDomainPart(bad_chars) # Check for unsafe characters. # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked @@ -484,6 +502,15 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob # they may not be valid, safe, or sensible Unicode strings. check_unsafe_chars(domain) + # Reject characters that would be rejected by UTS-46 normalization next but + # with an error message under our control. + bad_chars = sorted({ + c for c in domain + if not uts46_valid_char(c) + }) + if bad_chars: + raise EmailInvalidCharactersInDomainPart(bad_chars) + # Perform UTS-46 normalization, which includes casefolding, NFC normalization, # and converting all label separators (the period/full stop, fullwidth full stop, # ideographic full stop, and halfwidth ideographic full stop) to regular dots. @@ -500,13 +527,12 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob # Check for invalid characters after Unicode normalization which are not caught # by uts46_remap (see tests for examples). 
- bad_chars = { - safe_character_display(c) - for c in domain + bad_chars = sorted({ + c for c in domain if not ATEXT_HOSTNAME_INTL.match(c) - } + }) if bad_chars: - raise EmailSyntaxError("The part after the @-sign contains invalid characters after Unicode normalization: " + ", ".join(sorted(bad_chars)) + ".") + raise EmailInvalidCharactersInDomainPartAfterUnicodeNormalization(bad_chars) # The domain part is made up dot-separated "labels." Each label must # have at least one character and cannot start or end with dashes, which @@ -627,13 +653,12 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob # Check for invalid and unsafe characters. We have no test # case for this. - bad_chars = { - safe_character_display(c) - for c in domain_i18n + bad_chars = sorted({ + c for c in domain_i18n if not ATEXT_HOSTNAME_INTL.match(c) - } + }) if bad_chars: - raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".") + raise EmailInvalidCharactersInDomainPartAfterUnicodeNormalization(bad_chars) check_unsafe_chars(domain_i18n) # Check that it can be encoded back to IDNA ASCII. We have no test @@ -770,13 +795,12 @@ def validate_email_domain_literal(domain_literal: str) -> DomainLiteralValidatio # Check for permitted ASCII characters. This actually doesn't matter # since there will be an exception after anyway. - bad_chars = { - safe_character_display(c) - for c in domain_literal + bad_chars = sorted({ + c for c in domain_literal if not DOMAIN_LITERAL_CHARS.match(c) - } + }) if bad_chars: - raise EmailSyntaxError("The part after the @-sign contains invalid characters in brackets: " + ", ".join(sorted(bad_chars)) + ".") + raise EmailInvalidCharactersInDomainAddressLiteral(bad_chars) # There are no other domain literal tags. 
# https://www.iana.org/assignments/address-literal-tags/address-literal-tags.xhtml diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 853cc5e..74ed2f3 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -402,9 +402,7 @@ def test_domain_literal() -> None: ('.leadingdot@domain.com', 'An email address cannot start with a period.'), ('twodots..here@domain.com', 'An email address cannot have two periods in a row.'), ('trailingdot.@domain.email', 'An email address cannot have a period immediately before the @-sign.'), - ('me@⒈wouldbeinvalid.com', - "The part after the @-sign contains invalid characters (Codepoint U+2488 not allowed " - "at position 1 in '⒈wouldbeinvalid.com')."), + ('me@⒈wouldbeinvalid.com', "The part after the @-sign contains invalid characters: '⒈'."), ('me@\u037e.com', "The part after the @-sign contains invalid characters after Unicode normalization: ';'."), ('me@\u1fef.com', "The part after the @-sign contains invalid characters after Unicode normalization: '`'."), ('@example.com', 'There must be something before the @-sign.'),