From 0b0be3c3395cb34c39b65b2e02e326893bae9df9 Mon Sep 17 00:00:00 2001 From: mist Date: Tue, 2 Jun 2026 14:32:56 +0300 Subject: [PATCH] Email: recognize forwarded message dividers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `_ORIG_RE` (and its JS mirror `_TALON_ORIG_RE`) already recognised the Japanese forward marker `転送` alongside the "Original Message" delimiters, but not the English "Forwarded message" one. So Gmail-style forwards — including the ones Odysseus itself emits (`---------- Forwarded message ----------`, static/js/emailInbox.js) — were not treated as a quote boundary: - with a following Outlook From:/Date: header block, the divider line leaked into the level-0 reply bubble as noise; - with only the divider marking the forward (no header block), the body was not split into turns at all. Add `Forwarded\s+message` to the same `[-_=]{3,}`-delimited alternation in both the server-side parser and the JS mirror, so forward dividers are consumed as an attribution boundary like "----- Original Message -----". Locale variants of "Forwarded message" can follow the existing pattern. Tests cover both manifestations plus a negative control (the bare words "forwarded message" without `[-_=]{3,}` delimiters must not split). Checks: python -m pytest tests/test_forwarded_message_divider.py (3 passed), python -m py_compile src/email_thread_parser.py, node --check static/js/emailLibrary/utils.js, git diff --check. 
--- src/email_thread_parser.py | 3 +- static/js/emailLibrary/utils.js | 2 +- tests/test_forwarded_message_divider.py | 57 +++++++++++++++++++++++++ 3 files changed, 60 insertions(+), 2 deletions(-) create mode 100644 tests/test_forwarded_message_divider.py diff --git a/src/email_thread_parser.py b/src/email_thread_parser.py index 913847d..5547e37 100644 --- a/src/email_thread_parser.py +++ b/src/email_thread_parser.py @@ -57,7 +57,8 @@ _CCBCC = r"(?:Cc|Bcc|Kopie|Skrytá kopie|Копия)" _HDR_KEYS = rf"(?:{_FROM}|{_SENT}|{_SUBJ}|{_TO}|{_CCBCC}|Importance|Priority)" _ORIG_RE = re.compile( - r"(?:^|\n)[\s>]*[-_=]{3,}\s*(?:Original\s+Message|Ursprüngliche\s+Nachricht|" + r"(?:^|\n)[\s>]*[-_=]{3,}\s*(?:Original\s+Message|Forwarded\s+message|" + r"Ursprüngliche\s+Nachricht|" r"Mensaje\s+original|Messaggio\s+originale|Message\s+d['’]origine|" r"Oorspronkelijk\s+bericht|Original\s+meddelande|原文|原始邮件|転送)" r"\s*[-_=]{3,}", diff --git a/static/js/emailLibrary/utils.js b/static/js/emailLibrary/utils.js index f74541c..e4dc898 100644 --- a/static/js/emailLibrary/utils.js +++ b/static/js/emailLibrary/utils.js @@ -15,7 +15,7 @@ export const _TALON_FROM = '(?:From|Från|Von|De|Da|От|Od|Van|差出人|发件 export const _TALON_SENT = '(?:Sent|Skickat|Gesendet|Envoy[ée]|Inviato|Enviado|Verzonden|Отправлено|Wysłane|Date|送信日時|发送时间|寄件日期|Sendt|Lähetetty|Tarih|Datum|Data|Datum)'; export const _TALON_SUBJ = '(?:Subject|Ämne|Betreff|Objet|Oggetto|Asunto|Onderwerp|Тема|Temat|件名|主题|主旨|Emne|Aihe|Onderwerp|Konu)'; export const _TALON_TO = '(?:To|Till|An|À|A|Voor|Para|Naar|Кому|Do|宛先|收件人|Emri|Komu)'; -export const _TALON_ORIG_RE = /(?:^|\n)[\s>]*[-_=]{3,}\s*(?:Original\s+Message|Ursprüngliche\s+Nachricht|Mensaje\s+original|Messaggio\s+originale|Message\s+d['’]origine|Oorspronkelijk\s+bericht|Original\s+meddelande|Vor[ ]asal[a]\s+meddelande|原文|原始邮件|転送)\s*[-_=]{3,}/i; +export const _TALON_ORIG_RE = 
/(?:^|\n)[\s>]*[-_=]{3,}\s*(?:Original\s+Message|Forwarded\s+message|Ursprüngliche\s+Nachricht|Mensaje\s+original|Messaggio\s+originale|Message\s+d['’]origine|Oorspronkelijk\s+bericht|Original\s+meddelande|Vor[ ]asal[a]\s+meddelande|原文|原始邮件|転送)\s*[-_=]{3,}/i; // Minimum plain-text length of a "signature" before we bother folding it. // Short closings ("Cheers, John") stay inline — folding them would add diff --git a/tests/test_forwarded_message_divider.py b/tests/test_forwarded_message_divider.py new file mode 100644 index 0000000..3fc710d --- /dev/null +++ b/tests/test_forwarded_message_divider.py @@ -0,0 +1,57 @@ +"""The thread parser must treat the Gmail-style "---------- Forwarded message +---------" divider as a quote boundary, like "----- Original Message -----". + +`_ORIG_RE` already recognised the Japanese forward marker (転送) but not the +English "Forwarded message" one, so forwarded mail produced by Odysseus itself +(static/js/emailInbox.js emits exactly `---------- Forwarded message ----------`) +leaked the divider into the level-0 reply bubble — or, with no Outlook header +block to fall back on, was not split into turns at all. +""" +from src.email_thread_parser import parse_thread + + +def test_forwarded_divider_not_leaked_into_reply_body(): + text = ( + "See below.\n\n" + "---------- Forwarded message ---------\n" + "From: Alice <alice@example.com>\n" + "Date: Thu, May 7, 2026 at 11:33 AM\n" + "Subject: Original subject\n" + "To: Bob <bob@example.com>\n\n" + "Forwarded body content.\n" + ) + turns = parse_thread(None, text) + assert turns is not None + + # The reply turn must be clean — the divider is noise, not reply content. + assert turns[0]["level"] == 0 + assert "Forwarded message" not in turns[0]["body_html"] + # No turn at all should carry the raw divider in its rendered body. + assert all("Forwarded message" not in t["body_html"] for t in turns) + + # The forwarded content becomes a deeper turn with sender meta. 
+ deeper = [t for t in turns if t["level"] >= 1] + assert deeper, "forwarded body should split into a deeper turn" + assert "alice@example.com" in (deeper[0]["meta"] or "") + assert "Forwarded body content." in deeper[0]["body_html"] + + +def test_forwarded_divider_alone_triggers_split(): + # No Outlook header block — only the divider marks the forward. Before the + # fix this returned None (no split), folding the forward into the reply. + text = ( + "See the message below.\n\n" + "---------- Forwarded message ----------\n" + "Forwarded body with no header block.\n" + ) + turns = parse_thread(None, text) + assert turns is not None + assert any(t["level"] >= 1 for t in turns) + assert all("Forwarded message" not in t["body_html"] for t in turns) + + +def test_forwarded_words_without_delimiters_do_not_split(): + # Negative control: the bare words "forwarded message" in normal prose, + # with no [-_=]{3,} delimiters, must NOT be treated as a divider. + text = "I forwarded message after message to the team but heard nothing back." + assert parse_thread(None, text) is None