From 8abd6fc6a7946671902857327da5d296fa1b8587 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Wed, 16 Aug 2023 18:37:53 +0200 Subject: [PATCH] gh-79986: Add parsing for References/In-Reply-To email headers This is a followup to 46d88a113142b26c01c95c93846a89318ba87ffc (#13397), which added parsing for Message-ID. Similar handling is needed for the other two identification headers. --- Lib/email/_header_value_parser.py | 32 ++++++++ Lib/email/headerregistry.py | 14 ++++ .../test_email/test__header_value_parser.py | 75 +++++++++++++++++++ Lib/test/test_email/test_headerregistry.py | 13 ++++ ...5-07-29-11-37-22.gh-issue-79986.fnJbE_.rst | 3 + 5 files changed, 137 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2025-07-29-11-37-22.gh-issue-79986.fnJbE_.rst diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index cbff9694742490..46fef2048babe7 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -878,6 +878,12 @@ class MessageID(MsgID): class InvalidMessageID(MessageID): token_type = 'invalid-message-id' +class MessageIDList(TokenList): + token_type = 'message-id-list' + + @property + def message_ids(self): + return [x for x in self if x.token_type=='msg-id'] class Header(TokenList): token_type = 'header' @@ -2175,6 +2181,32 @@ def parse_message_id(value): return message_id +def parse_message_ids(value): + """in-reply-to = "In-Reply-To:" 1*msg-id CRLF + references = "References:" 1*msg-id CRLF + """ + message_id_list = MessageIDList() + while value: + if value[0] == ',': + # message id list separated with commas - this is invalid, + # but happens rather frequently in the wild + message_id_list.defects.append( + errors.InvalidHeaderDefect("comma in msg-id list")) + message_id_list.append( + WhiteSpaceTerminal(' ', 'invalid-comma-replacement')) + value = value[1:] + continue + try: + token, value = get_msg_id(value) + message_id_list.append(token) + except errors.HeaderParseError as ex: + token = get_unstructured(value) + message_id_list.append(InvalidMessageID(token)) + message_id_list.defects.append( + errors.InvalidHeaderDefect("Invalid msg-id: {!r}".format(ex))) + break + return message_id_list + # # XXX: As I begin to add additional header parsers, I'm realizing we probably # have two level of parser routines: the get_XXX methods that get a token in diff --git a/Lib/email/headerregistry.py b/Lib/email/headerregistry.py index 543141dc427ebe..0e8698efc0b966 100644 --- a/Lib/email/headerregistry.py +++ b/Lib/email/headerregistry.py @@ -534,6 +534,18 @@ def parse(cls, value, kwds): kwds['defects'].extend(parse_tree.all_defects) +class ReferencesHeader: + + max_count = 1 + value_parser = staticmethod(parser.parse_message_ids) + + @classmethod + def parse(cls, value, kwds): + kwds['parse_tree'] = parse_tree = cls.value_parser(value) + kwds['decoded'] = str(parse_tree) + kwds['defects'].extend(parse_tree.all_defects) + + # The header factory # _default_header_map = { @@ -557,6 +569,8 @@ def parse(cls, value, kwds): 'content-disposition': ContentDispositionHeader, 'content-transfer-encoding': ContentTransferEncodingHeader, 'message-id': MessageIDHeader, + 'in-reply-to': ReferencesHeader, + 'references': ReferencesHeader, } class HeaderRegistry: diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index f7f9f9c4e2fbb5..f33844910beee4 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2867,6 +2867,81 @@ def test_get_msg_id_ws_only_local(self): ) self.assertEqual(msg_id.token_type, 'msg-id') + def test_parse_message_ids_valid(self): + message_ids = self._test_parse_x( + parser.parse_message_ids, + " ", + " ", + " ", + [], + ) + self.assertEqual(message_ids.token_type, 'message-id-list') + + def test_parse_message_ids_empty(self): + message_ids = self._test_parse_x( + parser.parse_message_ids, + " ", + " ", + " ", + [errors.InvalidHeaderDefect], + ) + self.assertEqual(message_ids.token_type, 'message-id-list') + + def test_parse_message_ids_comment(self): + message_ids = self._test_parse_x( + parser.parse_message_ids, + " (foo's message from \"bar\")", + " (foo's message from \"bar\")", + " ", + [], + ) + self.assertEqual(message_ids.message_ids[0].value, ' ') + self.assertEqual(message_ids.token_type, 'message-id-list') + + def test_parse_message_ids_no_sep(self): + message_ids = self._test_parse_x( + parser.parse_message_ids, + "", + "", + "", + [], + ) + self.assertEqual(message_ids.message_ids[0].value, '') + self.assertEqual(message_ids.message_ids[1].value, '') + self.assertEqual(message_ids.token_type, 'message-id-list') + + def test_parse_message_ids_comma_sep(self): + message_ids = self._test_parse_x( + parser.parse_message_ids, + ",", + " ", + " ", + [errors.InvalidHeaderDefect], + ) + self.assertEqual(message_ids.message_ids[0].value, '') + self.assertEqual(message_ids.message_ids[1].value, '') + self.assertEqual(message_ids.token_type, 'message-id-list') + + def test_parse_message_ids_invalid_id(self): + message_ids = self._test_parse_x( + parser.parse_message_ids, + "", + "", + "", + [errors.InvalidHeaderDefect]*2, + ) + self.assertEqual(message_ids.token_type, 'message-id-list') + + def test_parse_message_ids_broken_ang(self): + message_ids = self._test_parse_x( + parser.parse_message_ids, + " >bar@foo", + " >bar@foo", + " >bar@foo", + [errors.InvalidHeaderDefect]*1, + ) + self.assertEqual(message_ids.token_type, 'message-id-list') + @parameterize diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py index 7138aa4c556d1f..95c6afbee41ef5 100644 --- a/Lib/test/test_email/test_headerregistry.py +++ b/Lib/test/test_email/test_headerregistry.py @@ -1821,5 +1821,18 @@ def test_message_id_header_is_not_folded(self): h.fold(policy=policy.default.clone(max_line_length=20)), 'Message-ID:\n <ईमेलfromMessage@wők.com>\n') + def test_fold_references(self): + h = self.make_header( + 'References', + ' ' + '' + ) + self.assertEqual( + h.fold(policy=policy.default.clone(max_line_length=20)), + 'References: ' + '\n' + ' \n') + + if __name__ == '__main__': unittest.main() diff --git a/Misc/NEWS.d/next/Library/2025-07-29-11-37-22.gh-issue-79986.fnJbE_.rst b/Misc/NEWS.d/next/Library/2025-07-29-11-37-22.gh-issue-79986.fnJbE_.rst new file mode 100644 index 00000000000000..327bbf869bce09 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-07-29-11-37-22.gh-issue-79986.fnJbE_.rst @@ -0,0 +1,3 @@ +Add parsing for ``References`` and ``In-Reply-To`` headers to the :mod:`email` +library that parses the header content as lists of message id tokens. This +prevents them from being folded incorrectly.