Email: recognize forwarded message dividers

`_ORIG_RE` (and its JS mirror `_TALON_ORIG_RE`) already recognised the
Japanese forward marker `転送` alongside the "Original Message" delimiters,
but not the English "Forwarded message" one. So Gmail-style forwards —
including the ones Odysseus itself emits (`---------- Forwarded message
----------`, static/js/emailInbox.js) — were not treated as a quote
boundary:

  - with a following Outlook From:/Date: header block, the divider line
    leaked into the level-0 reply bubble as noise;
  - with only the divider marking the forward (no header block), the body
    was not split into turns at all.

Add `Forwarded\s+message` to the same `[-_=]{3,}`-delimited alternation in
both the server-side parser and the JS mirror, so forward dividers are
consumed as an attribution boundary like "----- Original Message -----".
Locale variants of "Forwarded message" can follow the existing pattern.

Tests cover both manifestations plus a negative control (the bare words
"forwarded message" without `[-_=]{3,}` delimiters must not split).

Checks: python -m pytest tests/test_forwarded_message_divider.py (3 passed),
python -m py_compile src/email_thread_parser.py, node --check
static/js/emailLibrary/utils.js, git diff --check.
This commit is contained in:
mist
2026-06-02 14:32:56 +03:00
committed by GitHub
parent 6ea8fec896
commit 0b0be3c339
3 changed files with 60 additions and 2 deletions

View File

@@ -57,7 +57,8 @@ _CCBCC = r"(?:Cc|Bcc|Kopie|Skrytá kopie|Копия)"
_HDR_KEYS = rf"(?:{_FROM}|{_SENT}|{_SUBJ}|{_TO}|{_CCBCC}|Importance|Priority)" _HDR_KEYS = rf"(?:{_FROM}|{_SENT}|{_SUBJ}|{_TO}|{_CCBCC}|Importance|Priority)"
_ORIG_RE = re.compile( _ORIG_RE = re.compile(
r"(?:^|\n)[\s>]*[-_=]{3,}\s*(?:Original\s+Message|Ursprüngliche\s+Nachricht|" r"(?:^|\n)[\s>]*[-_=]{3,}\s*(?:Original\s+Message|Forwarded\s+message|"
r"Ursprüngliche\s+Nachricht|"
r"Mensaje\s+original|Messaggio\s+originale|Message\s+d[']origine|" r"Mensaje\s+original|Messaggio\s+originale|Message\s+d[']origine|"
r"Oorspronkelijk\s+bericht|Original\s+meddelande|原文|原始邮件|転送)" r"Oorspronkelijk\s+bericht|Original\s+meddelande|原文|原始邮件|転送)"
r"\s*[-_=]{3,}", r"\s*[-_=]{3,}",

View File

@@ -15,7 +15,7 @@ export const _TALON_FROM = '(?:From|Från|Von|De|Da|От|Od|Van|差出人|发件
export const _TALON_SENT = '(?:Sent|Skickat|Gesendet|Envoy[ée]|Inviato|Enviado|Verzonden|Отправлено|Wysłane|Date|送信日時|发送时间|寄件日期|Sendt|Lähetetty|Tarih|Datum|Data|Datum)'; export const _TALON_SENT = '(?:Sent|Skickat|Gesendet|Envoy[ée]|Inviato|Enviado|Verzonden|Отправлено|Wysłane|Date|送信日時|发送时间|寄件日期|Sendt|Lähetetty|Tarih|Datum|Data|Datum)';
export const _TALON_SUBJ = '(?:Subject|Ämne|Betreff|Objet|Oggetto|Asunto|Onderwerp|Тема|Temat|件名|主题|主旨|Emne|Aihe|Onderwerp|Konu)'; export const _TALON_SUBJ = '(?:Subject|Ämne|Betreff|Objet|Oggetto|Asunto|Onderwerp|Тема|Temat|件名|主题|主旨|Emne|Aihe|Onderwerp|Konu)';
export const _TALON_TO = '(?:To|Till|An|À|A|Voor|Para|Naar|Кому|Do|宛先|收件人|Emri|Komu)'; export const _TALON_TO = '(?:To|Till|An|À|A|Voor|Para|Naar|Кому|Do|宛先|收件人|Emri|Komu)';
export const _TALON_ORIG_RE = /(?:^|\n)[\s>]*[-_=]{3,}\s*(?:Original\s+Message|Ursprüngliche\s+Nachricht|Mensaje\s+original|Messaggio\s+originale|Message\s+d[']origine|Oorspronkelijk\s+bericht|Original\s+meddelande|Vor[ ]asal[a]\s+meddelande|原文|原始邮件|転送)\s*[-_=]{3,}/i; export const _TALON_ORIG_RE = /(?:^|\n)[\s>]*[-_=]{3,}\s*(?:Original\s+Message|Forwarded\s+message|Ursprüngliche\s+Nachricht|Mensaje\s+original|Messaggio\s+originale|Message\s+d[']origine|Oorspronkelijk\s+bericht|Original\s+meddelande|Vor[ ]asal[a]\s+meddelande|原文|原始邮件|転送)\s*[-_=]{3,}/i;
// Minimum plain-text length of a "signature" before we bother folding it. // Minimum plain-text length of a "signature" before we bother folding it.
// Short closings ("Cheers, John") stay inline — folding them would add // Short closings ("Cheers, John") stay inline — folding them would add

View File

@@ -0,0 +1,57 @@
"""The thread parser must treat the Gmail-style "---------- Forwarded message
---------" divider as a quote boundary, like "----- Original Message -----".
`_ORIG_RE` already recognised the Japanese forward marker (転送) but not the
English "Forwarded message" one, so forwarded mail produced by Odysseus itself
(static/js/emailInbox.js emits exactly `---------- Forwarded message ----------`)
leaked the divider into the level-0 reply bubble — or, with no Outlook header
block to fall back on, was not split into turns at all.
"""
from src.email_thread_parser import parse_thread
def test_forwarded_divider_not_leaked_into_reply_body():
    """A forward divider followed by a From:/Date: header block must be
    consumed as a boundary instead of showing up in any rendered turn."""
    message = "".join([
        "See below.\n\n",
        "---------- Forwarded message ---------\n",
        "From: Alice <alice@example.com>\n",
        "Date: Thu, May 7, 2026 at 11:33 AM\n",
        "Subject: Original subject\n",
        "To: Bob <bob@x.com>\n\n",
        "Forwarded body content.\n",
    ])
    parsed = parse_thread(None, message)
    assert parsed is not None

    # The top-level reply comes first and must not contain the divider text.
    reply = parsed[0]
    assert reply["level"] == 0
    assert "Forwarded message" not in reply["body_html"]

    # Nor may the raw divider survive in any other turn's rendered body.
    assert not any("Forwarded message" in turn["body_html"] for turn in parsed)

    # The forwarded content lands in a nested turn that carries the sender.
    nested = [turn for turn in parsed if turn["level"] >= 1]
    assert nested, "forwarded body should split into a deeper turn"
    first_nested = nested[0]
    assert "alice@example.com" in (first_nested["meta"] or "")
    assert "Forwarded body content." in first_nested["body_html"]
def test_forwarded_divider_alone_triggers_split():
    """The divider on its own, with no header block after it, is enough to
    split the body into turns.

    Before the fix this input returned None (no split), folding the forward
    into the reply.
    """
    message = "".join([
        "See the message below.\n\n",
        "---------- Forwarded message ----------\n",
        "Forwarded body with no header block.\n",
    ])
    parsed = parse_thread(None, message)
    assert parsed is not None

    # At least one turn must sit below the top-level reply.
    nested = [turn for turn in parsed if turn["level"] >= 1]
    assert nested

    # The divider itself must not be rendered in any turn.
    assert not any("Forwarded message" in turn["body_html"] for turn in parsed)
def test_forwarded_words_without_delimiters_do_not_split():
    """Negative control: "forwarded message" appearing in ordinary prose,
    without the [-_=]{3,} delimiters around it, is not a divider."""
    prose = "I forwarded message after message to the team but heard nothing back."
    result = parse_thread(None, prose)
    # No boundary was found, so the parser reports no split.
    assert result is None