Odysseus v1.0
This commit is contained in:
613
src/email_thread_parser.py
Normal file
613
src/email_thread_parser.py
Normal file
@@ -0,0 +1,613 @@
|
||||
"""
|
||||
email_thread_parser.py
|
||||
|
||||
Server-side port of the JS thread parser in static/js/emailLibrary.js.
|
||||
Walks an email body (HTML or plain text) and returns a tree of reply turns
|
||||
that the client can render directly without re-parsing.
|
||||
|
||||
Mirrors the rules from talon (mailgun) and email-reply-parser:
|
||||
- Multilingual "On <date>, <name> wrote:" attribution lines (20+ locales)
|
||||
- Outlook-style "From: ... Sent: ... Subject:" header blocks
|
||||
- "----- Original Message -----" delimiters
|
||||
- <blockquote> nesting (HTML)
|
||||
- "> " prefix nesting (plain text)
|
||||
|
||||
Returns a list of dicts:
|
||||
[
|
||||
{"level": 0, "body_html": "...", "meta": null},
|
||||
{"level": 1, "body_html": "...", "meta": "Alice <a@x> · May 5"},
|
||||
{"level": 2, "body_html": "...", "meta": "Bob <b@y> · May 4"},
|
||||
...
|
||||
]
|
||||
where level 0 is the current reply, increasing levels = deeper in the chain.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import html as _html
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
# Bump whenever the parser's output shape or splitting rules change. The
|
||||
# cache layer wraps turns as {"v": THREAD_PARSER_VERSION, "turns": [...]}
|
||||
# and treats anything with a different version as stale.
|
||||
THREAD_PARSER_VERSION = 6
|
||||
|
||||
# ── Locale tables (same as static/js/emailLibrary.js _TALON_*) ──
|
||||
|
||||
_WROTE = (
|
||||
r"(?:wrote|écrit|escribió|scrisse|schrieb|skrev|schreef|napisał|написал|"
|
||||
r"napsal|написа|έγραψε|katselivat|napisao|написав|napisała|napisali|"
|
||||
r"hat geschrieben|kirjoitti|написала|escreveu)"
|
||||
)
|
||||
_FROM = (
|
||||
r"(?:From|Från|Von|De|Da|От|Od|Van|差出人|发件人|寄件人|Lähettäjä|"
|
||||
r"Avsender|Pošiljatelj|Frá)"
|
||||
)
|
||||
_SENT = (
|
||||
r"(?:Sent|Skickat|Gesendet|Envoy[ée]|Inviato|Enviado|Verzonden|Отправлено|"
|
||||
r"Wysłane|Date|送信日時|发送时间|寄件日期|Sendt|Lähetetty|Tarih|Datum|Data)"
|
||||
)
|
||||
_SUBJ = (
|
||||
r"(?:Subject|Ämne|Betreff|Objet|Oggetto|Asunto|Onderwerp|Тема|Temat|"
|
||||
r"件名|主题|主旨|Emne|Aihe|Konu)"
|
||||
)
|
||||
_TO = r"(?:To|Till|An|À|A|Voor|Para|Naar|Кому|Do|宛先|收件人|Komu)"
|
||||
_CCBCC = r"(?:Cc|Bcc|Kopie|Skrytá kopie|Копия)"
|
||||
_HDR_KEYS = rf"(?:{_FROM}|{_SENT}|{_SUBJ}|{_TO}|{_CCBCC}|Importance|Priority)"
|
||||
|
||||
_ORIG_RE = re.compile(
|
||||
r"(?:^|\n)[\s>]*[-_=]{3,}\s*(?:Original\s+Message|Ursprüngliche\s+Nachricht|"
|
||||
r"Mensaje\s+original|Messaggio\s+originale|Message\s+d['’]origine|"
|
||||
r"Oorspronkelijk\s+bericht|Original\s+meddelande|原文|原始邮件|転送)"
|
||||
r"\s*[-_=]{3,}",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
_WROTE_LINE_RE = re.compile(rf"^\s*On\s.+?\s{_WROTE}\s*:\s*$", re.IGNORECASE | re.MULTILINE)
|
||||
# CJK-style attribution lines — Japanese Gmail / Yahoo Mail JP / etc.
|
||||
# Examples (all valid):
|
||||
# 2026年5月11日(月) 21:28 <alice@example.com>:
|
||||
# 2026年5月11日 21:28 alice@example.com:
|
||||
# 2026/05/11 21:28 <alice@example.com> のメッセージ:
|
||||
# 2026年5月11日(月) 21:28に Alice Smith <alice@example.com> のメッセージ:
|
||||
# 2026年5月11日 21:28、alice@example.com さんは書きました:
|
||||
# Alice さんは 2026/05/11 21:28 に書きました:
|
||||
_CJK_ATTRIB_LINE_RE = re.compile(
|
||||
r"^\s*(?:"
|
||||
# date(weekday) time <email>: (Gmail JP default)
|
||||
r"\d{4}[年/.-]\d{1,2}[月/.-]\d{1,2}日?(?:\s*[\(\(].+?[\)\)])?"
|
||||
r"\s+\d{1,2}:\d{2}(?:\s*[APAP][MM])?"
|
||||
r"(?:に|、|,)?\s*(?:.+?\s+)?[<<]?[\w.+\-]+@[\w.\-]+\.[A-Za-z]{2,}[>>]?"
|
||||
r"\s*(?:のメッセージ|さんは(?:書|お?書き)きました|wrote)?\s*[::]\s*$"
|
||||
r"|"
|
||||
# 何々さんは 2026/05/11 21:28 に書きました:
|
||||
r".+?(?:さん|様)\s*(?:は|が)\s+\d{4}[年/.-]\d{1,2}[月/.-]\d{1,2}日?"
|
||||
r"(?:\s*[\(\(].+?[\)\)])?\s+\d{1,2}:\d{2}\s*(?:に)?\s*(?:書|お?書き)きました\s*[::]\s*$"
|
||||
r"|"
|
||||
# Chinese "XXX 写道:" preceded by a date or address
|
||||
r".+?\s*写道\s*[::]\s*$"
|
||||
r"|"
|
||||
# Korean "님이 작성:"
|
||||
r".+?\s*님이\s*작성(?:한\s*내용)?\s*[::]\s*$"
|
||||
r")",
|
||||
re.MULTILINE,
|
||||
)
|
||||
_OUTLOOK_HEADER_RE = re.compile(
|
||||
rf"{_FROM}\s*:\s*[^\n]+\s*\n\s*(?:.+\n)?{_SENT}\s*:\s*[^\n]+\s*\n",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
# Stop the From/Date captures at the next header key so they don't swallow
|
||||
# the whole header block when whitespace has been normalised.
|
||||
_FROM_STOP = rf"\s+(?:{_FROM}|{_SENT}|{_SUBJ}|{_TO}|{_CCBCC}|Importance|Priority)\s*:"
|
||||
_DATE_STOP = rf"\s+(?:{_FROM}|{_SUBJ}|{_TO}|{_CCBCC}|Importance|Priority)\s*:"
|
||||
_QUOTE_META_FROM = re.compile(
|
||||
rf"{_FROM}\s*:\s*(.+?)(?:(?={_FROM_STOP})|$)",
|
||||
re.IGNORECASE | re.DOTALL,
|
||||
)
|
||||
_QUOTE_META_DATE = re.compile(
|
||||
rf"{_SENT}\s*:\s*(.+?)(?:(?={_DATE_STOP})|$)",
|
||||
re.IGNORECASE | re.DOTALL,
|
||||
)
|
||||
# Greedy date capture so multi-comma dates ("Thu, May 7, 2026, 11:33 AM,")
|
||||
# don't collapse to just the day. We let the comma + lazy author match
|
||||
# back off to the LAST comma before "wrote:".
|
||||
_GMAIL_ATTRIB = re.compile(
|
||||
rf"On\s+(.+),\s+([^,]+?)\s+{_WROTE}\s*:",
|
||||
re.IGNORECASE | re.DOTALL,
|
||||
)
|
||||
|
||||
|
||||
def _extract_quote_meta(text_or_html: str) -> str | None:
|
||||
"""Pull a '<sender> · <date>' chip from a quoted block. Preserves
|
||||
angle-bracketed email addresses (`<foo@bar.com>`) so callers can
|
||||
identify the sender for chat-bubble alignment."""
|
||||
if not text_or_html:
|
||||
return None
|
||||
plain = re.sub(r"<style[\s\S]*?</style>", " ", text_or_html, flags=re.IGNORECASE)
|
||||
# Strip HTML tags, but keep <foo@bar> patterns since they carry the
|
||||
# sender's address that downstream consumers (bubble renderer) need.
|
||||
plain = re.sub(r"<(?![^@>\s]+@[^@>\s]+>)[^>]+>", " ", plain)
|
||||
plain = re.sub(r" ", " ", plain, flags=re.IGNORECASE)
|
||||
plain = plain.replace("&", "&").replace("<", "<").replace(">", ">").replace(""", '"')
|
||||
plain = re.sub(r"\s+", " ", plain).strip()[:1500]
|
||||
|
||||
f = _QUOTE_META_FROM.search(plain)
|
||||
d = _QUOTE_META_DATE.search(plain)
|
||||
if f and d:
|
||||
return f"{f.group(1).strip()} · {d.group(1).strip()[:80]}"
|
||||
g = _GMAIL_ATTRIB.search(plain)
|
||||
if g:
|
||||
date, who = g.group(1).strip(), g.group(2).strip()
|
||||
return f"{who} · {date}"
|
||||
# CJK attribution: "YYYY年MM月DD日(曜) HH:MM <email>:"
|
||||
cjk = re.search(
|
||||
r"(\d{4}[年/.-]\d{1,2}[月/.-]\d{1,2}日?(?:\s*[\(\(][^\)\)]+?[\)\)])?\s+\d{1,2}:\d{2}(?:\s*[APAP][MM])?)"
|
||||
r"\s*(?:に|、|,)?\s*"
|
||||
r"(?:(.+?)\s+)?" # optional display name
|
||||
r"[<<]?([\w.+\-]+@[\w.\-]+\.[A-Za-z]{2,})[>>]?",
|
||||
plain,
|
||||
)
|
||||
if cjk:
|
||||
date = cjk.group(1).strip()
|
||||
who = (cjk.group(2) or cjk.group(3) or '').strip()
|
||||
return f"{who} · {date}" if who else date
|
||||
if f:
|
||||
return f.group(1).strip()
|
||||
if d:
|
||||
return d.group(1).strip()
|
||||
return None
|
||||
|
||||
|
||||
# ── Plaintext path ──
|
||||
|
||||
# Outlook sometimes renders a one-line "conversation summary header" at
|
||||
# the very top of a reply when the recipient's mail client copies it from
|
||||
# the reading pane (whitespace gets squashed). Looks like:
|
||||
# "alice@example.comThursday, May 7, 2026 3:06 PM To: housekeeping <...> Subject: ..."
|
||||
# or just:
|
||||
# "alice@example.comThursday, May 7, 2026 3:06 PM"
|
||||
# Same info already lives in the envelope, so strip it.
|
||||
_MASHED_HDR_RE = re.compile(
|
||||
r"^\s*[\w.+\-]+@[\w.\-]+\.[A-Za-z]{2,}" # email address
|
||||
r"\s*"
|
||||
r"(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)[a-z]*,?\s+" # day name
|
||||
r"\S+\s+\d+,?\s*\d{4}\s+\d{1,2}:\d{2}" # date + time
|
||||
r"(?:\s*[AP]M)?" # optional AM/PM
|
||||
rf"(?:\s+{_TO}\s*:\s*[^\n]+(?:\s+{_SUBJ}\s*:\s*[^\n]*)?)?" # optional To:/Subject:
|
||||
r"\s*(?:\n|$)", # end of line
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
def _strip_mashed_header(text: str) -> str:
|
||||
if not text:
|
||||
return text
|
||||
m = _MASHED_HDR_RE.match(text)
|
||||
if not m:
|
||||
return text
|
||||
rest = text[m.end():]
|
||||
# Skip any blank lines that immediately follow the strip.
|
||||
rest = re.sub(r"^\s*\n+", "", rest)
|
||||
return rest
|
||||
|
||||
|
||||
def _normalize_body(text: str) -> str:
|
||||
"""Strip noise that mail clients (mostly Outlook) inject into the
|
||||
plaintext body but that adds no signal — duplicate <mailto:> link
|
||||
decorations, bracketed-URL annotations, repeated blank lines, and
|
||||
the mashed conversation-header at the top."""
|
||||
if not text:
|
||||
return text
|
||||
text = _strip_mashed_header(text)
|
||||
# Outlook appends `<mailto:foo@bar>` after every email address it
|
||||
# finds, and `<https://...>` after every URL. Both are duplicate
|
||||
# noise — they show the same target as the visible text. Drop them.
|
||||
text = re.sub(r"<mailto:[^<>\s]*>", "", text, flags=re.IGNORECASE)
|
||||
text = re.sub(r"<https?://[^<>\s]*>", "", text, flags=re.IGNORECASE)
|
||||
# Trim trailing whitespace (incl. NBSP / form-feed / tab) so blank
|
||||
# lines that mail clients fill with non-breaking spaces still count
|
||||
# as blank for the collapse step below.
|
||||
text = re.sub(r"[^\S\n]+(\n|$)", r"\1", text)
|
||||
# Collapse 3+ consecutive newlines (vertical-space soup) into 2.
|
||||
text = re.sub(r"\n{3,}", "\n\n", text)
|
||||
return text
|
||||
|
||||
|
||||
def _outlook_header_block_end(stripped: list[str], levels: list[int], start: int) -> int:
|
||||
"""If lines[start..N] form an Outlook From/Sent/To/Subject header block
|
||||
at the same base level, return N (exclusive end). Otherwise return start.
|
||||
Requires a From: line followed within 5 lines by a Sent:/Date: line."""
|
||||
if start >= len(stripped):
|
||||
return start
|
||||
base = levels[start]
|
||||
first = stripped[start].strip()
|
||||
if not re.match(rf"^{_FROM}\s*:\s*\S", first, re.IGNORECASE):
|
||||
return start
|
||||
# Look ahead for the matching Sent:/Date: line at the same base level.
|
||||
found_sent = False
|
||||
j = start + 1
|
||||
while j < len(stripped) and j < start + 6 and levels[j] == base:
|
||||
nl = stripped[j].strip()
|
||||
if not nl:
|
||||
j += 1
|
||||
continue
|
||||
if re.match(rf"^{_SENT}\s*:", nl, re.IGNORECASE):
|
||||
found_sent = True
|
||||
break
|
||||
if not re.match(rf"^{_HDR_KEYS}\s*:", nl, re.IGNORECASE):
|
||||
return start # something other than a header key — abort
|
||||
j += 1
|
||||
if not found_sent:
|
||||
return start
|
||||
# Consume header-key lines until we hit a non-header line OR a blank line.
|
||||
j = start + 1
|
||||
while j < len(stripped) and levels[j] == base:
|
||||
nl = stripped[j].strip()
|
||||
if not nl:
|
||||
j += 1
|
||||
break
|
||||
if re.match(rf"^{_HDR_KEYS}\s*:", nl, re.IGNORECASE):
|
||||
j += 1
|
||||
continue
|
||||
break
|
||||
return j
|
||||
|
||||
|
||||
def _parse_plaintext(text: str) -> list[dict[str, Any]] | None:
|
||||
"""Walk `>` quote prefix levels + inline attribution markers at any
|
||||
level. Each attribution event AND each `>`-level increment counts as
|
||||
one conversation step, with one important exception: an attribution
|
||||
marker IMMEDIATELY followed by a deeper `>` block is the same event
|
||||
as that `>` increase (the classic Gmail "On X wrote:\\n> quoted"
|
||||
pattern) and contributes only one step.
|
||||
|
||||
Returns a flat list of {level, body_html, meta} or None when nothing
|
||||
quoted is detected."""
|
||||
if not text or len(text) > 200_000:
|
||||
return None
|
||||
text = _normalize_body(text)
|
||||
lines = text.splitlines()
|
||||
|
||||
base_levels: list[int] = []
|
||||
stripped_lines: list[str] = []
|
||||
for line in lines:
|
||||
m = re.match(r"^((?:>\s?)+)", line)
|
||||
n = line[: m.end()].count(">") if m else 0
|
||||
base_levels.append(n)
|
||||
stripped_lines.append(re.sub(r"^(?:>\s?)+", "", line) if n > 0 else line)
|
||||
|
||||
has_quotes = any(l > 0 for l in base_levels)
|
||||
has_attrib = bool(
|
||||
_WROTE_LINE_RE.search(text) or _ORIG_RE.search(text)
|
||||
or _OUTLOOK_HEADER_RE.search(text) or _CJK_ATTRIB_LINE_RE.search(text)
|
||||
)
|
||||
if not has_quotes and not has_attrib:
|
||||
return None
|
||||
|
||||
turns: list[dict[str, Any]] = []
|
||||
buf: list[str] = []
|
||||
cur_level = 0
|
||||
pending_meta: str | None = None
|
||||
# depth_at_base[B] = the effective conversation depth recorded the
|
||||
# last time we were at `>` base level B. Used to restore depth when
|
||||
# the > nesting decreases (we hop back to a shallower base).
|
||||
depth_at_base: dict[int, int] = {0: 0}
|
||||
depth = 0
|
||||
prev_base = 0
|
||||
|
||||
def lookahead_content_base(start_idx: int) -> int | None:
|
||||
j = start_idx
|
||||
while j < len(lines) and not stripped_lines[j].strip():
|
||||
j += 1
|
||||
return base_levels[j] if j < len(lines) else None
|
||||
|
||||
def flush() -> None:
|
||||
# `buf` is only mutated via .clear() / .append() in the enclosing
|
||||
# scope, never re-assigned, so it doesn't need `nonlocal`.
|
||||
nonlocal pending_meta
|
||||
if not buf:
|
||||
return
|
||||
body = "\n".join(buf).rstrip()
|
||||
if body or cur_level > 0:
|
||||
turns.append({
|
||||
"level": cur_level,
|
||||
"body_html": _escape_to_html(body),
|
||||
"meta": pending_meta,
|
||||
})
|
||||
buf.clear()
|
||||
pending_meta = None
|
||||
|
||||
i = 0
|
||||
while i < len(lines):
|
||||
base = base_levels[i]
|
||||
stripped = stripped_lines[i]
|
||||
|
||||
# `>` base level change → flush current turn, then step depth.
|
||||
if base > prev_base:
|
||||
flush()
|
||||
for b in range(prev_base + 1, base + 1):
|
||||
depth += 1
|
||||
depth_at_base[b] = depth
|
||||
cur_level = depth
|
||||
elif base < prev_base:
|
||||
flush()
|
||||
depth = depth_at_base.get(base, base)
|
||||
for b in list(depth_at_base.keys()):
|
||||
if b > base:
|
||||
del depth_at_base[b]
|
||||
cur_level = depth
|
||||
prev_base = base
|
||||
|
||||
is_gmail = bool(re.match(rf"^\s*On\s.+?\s{_WROTE}\s*:\s*$", stripped, re.IGNORECASE))
|
||||
is_cjk = bool(_CJK_ATTRIB_LINE_RE.match(stripped))
|
||||
is_orig = bool(_ORIG_RE.search("\n" + stripped))
|
||||
outlook_end = _outlook_header_block_end(stripped_lines, base_levels, i)
|
||||
is_outlook = outlook_end > i
|
||||
|
||||
if is_gmail or is_cjk or is_orig or is_outlook:
|
||||
# Collect the full attribution text for meta extraction.
|
||||
attrib_end = outlook_end if is_outlook else (i + 1)
|
||||
meta_text = "\n".join(stripped_lines[i:attrib_end])
|
||||
|
||||
# "-----Original Message-----" is almost always immediately
|
||||
# followed by an Outlook From:/Sent: header — fold that into
|
||||
# the SAME attribution event so we don't double-bump.
|
||||
if is_orig:
|
||||
j = attrib_end
|
||||
while j < len(lines) and base_levels[j] == base and not stripped_lines[j].strip():
|
||||
j += 1
|
||||
if j < len(lines) and base_levels[j] == base:
|
||||
oe2 = _outlook_header_block_end(stripped_lines, base_levels, j)
|
||||
if oe2 > j:
|
||||
meta_text = meta_text + "\n" + "\n".join(stripped_lines[j:oe2])
|
||||
attrib_end = oe2
|
||||
|
||||
# If the next content line lives at a deeper > base, the
|
||||
# upcoming `>` increase will be the depth step — suppress
|
||||
# our own bump so we don't double up. Otherwise, this
|
||||
# attribution IS the step.
|
||||
next_base = lookahead_content_base(attrib_end)
|
||||
flush()
|
||||
if next_base is not None and next_base > base:
|
||||
pending_meta = _extract_quote_meta(meta_text) or meta_text.strip().splitlines()[0]
|
||||
else:
|
||||
depth += 1
|
||||
depth_at_base[base] = depth
|
||||
cur_level = depth
|
||||
pending_meta = _extract_quote_meta(meta_text) or meta_text.strip().splitlines()[0]
|
||||
i = attrib_end
|
||||
continue
|
||||
|
||||
buf.append(stripped)
|
||||
i += 1
|
||||
|
||||
flush()
|
||||
|
||||
if not turns or (len(turns) == 1 and turns[0]["level"] == 0):
|
||||
return None
|
||||
return turns
|
||||
|
||||
|
||||
def _escape_to_html(text: str) -> str:
|
||||
"""Conservative plaintext → HTML: escape, then linkify URLs and convert
|
||||
newlines to <br>."""
|
||||
if not text:
|
||||
return ""
|
||||
out = _html.escape(text)
|
||||
out = re.sub(
|
||||
r"(https?://[^\s<>\"]+)",
|
||||
lambda m: f'<a href="{m.group(1)}" target="_blank" rel="noopener">{m.group(1)}</a>',
|
||||
out,
|
||||
)
|
||||
return out.replace("\n", "<br>")
|
||||
|
||||
|
||||
# ── HTML path (BeautifulSoup) ──
|
||||
|
||||
def _is_quote_container(tag) -> bool:
|
||||
"""Return True if a BeautifulSoup tag is a known quote-container element.
|
||||
Covers Gmail (`gmail_quote`), Apple Mail (`type="cite"`), Yahoo
|
||||
(`yahoo_quoted`), Outlook (`divRplyFwdMsg`, `OutlookMessageHeader`,
|
||||
`gmail_attr` precedes a quote in some forwards), and the standard
|
||||
`<blockquote>`."""
|
||||
if tag is None:
|
||||
return False
|
||||
name = (getattr(tag, "name", None) or "").lower()
|
||||
if name == "blockquote":
|
||||
return True
|
||||
cls = " ".join(tag.get("class") or []).lower() if hasattr(tag, "get") else ""
|
||||
if "gmail_quote" in cls or "yahoo_quoted" in cls or "moz-cite-prefix" in cls:
|
||||
return True
|
||||
if "outlookmessageheader" in cls or "wordsection1" in cls:
|
||||
return True
|
||||
if (tag.get("id") if hasattr(tag, "get") else "") == "divRplyFwdMsg":
|
||||
return True
|
||||
typ = (tag.get("type") if hasattr(tag, "get") else "") or ""
|
||||
if name == "div" and typ.lower() == "cite":
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _parse_html(html: str) -> list[dict[str, Any]] | None:
|
||||
"""Walk top-level quote-container elements and recurse into nested ones.
|
||||
Returns None if no quote markers are present. Recognises <blockquote>
|
||||
plus the Gmail / Apple Mail / Yahoo / Outlook / Thunderbird wrappers
|
||||
that don't use <blockquote>."""
|
||||
if not html or len(html) > 200_000:
|
||||
return None
|
||||
try:
|
||||
from bs4 import BeautifulSoup
|
||||
except Exception:
|
||||
return None # bs4 not installed → caller falls back to plaintext / client parse
|
||||
|
||||
try:
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
# Find all quote containers, then keep only the top-level ones (those
|
||||
# whose nearest ancestor that's also a quote container is None).
|
||||
all_quotes = [t for t in soup.find_all(True) if _is_quote_container(t)]
|
||||
if not all_quotes:
|
||||
return None
|
||||
|
||||
def has_quote_ancestor(t) -> bool:
|
||||
p = t.parent
|
||||
while p is not None:
|
||||
if _is_quote_container(p):
|
||||
return True
|
||||
p = p.parent
|
||||
return False
|
||||
|
||||
tops = [t for t in all_quotes if not has_quote_ancestor(t)]
|
||||
if not tops:
|
||||
return None
|
||||
|
||||
turns: list[dict[str, Any]] = []
|
||||
|
||||
# Collect the new-reply content from OUTSIDE the quote containers.
|
||||
# Most replies are top-posted (head), but Japanese / formal emails are
|
||||
# frequently bottom-posted (tail). Some users do both. We combine head
|
||||
# and tail into a single level-0 turn so the new content always shows
|
||||
# first, regardless of source-order position.
|
||||
parent_children = list(tops[0].parent.children if tops[0].parent else [])
|
||||
|
||||
head_nodes = []
|
||||
for sib in parent_children:
|
||||
if sib is tops[0]:
|
||||
break
|
||||
head_nodes.append(sib)
|
||||
|
||||
# Tail = everything after the LAST top-level quote at this parent level
|
||||
last_top = tops[-1]
|
||||
tail_nodes = []
|
||||
after_last = False
|
||||
for sib in parent_children:
|
||||
if sib is last_top:
|
||||
after_last = True
|
||||
continue
|
||||
# Skip any other top-level quotes between (they get walked below)
|
||||
if after_last and sib in tops:
|
||||
continue
|
||||
if after_last:
|
||||
tail_nodes.append(sib)
|
||||
|
||||
def _strip_trailing_attribution(html_chunk: str) -> str:
|
||||
text = re.sub(r"<[^>]+>", " ", html_chunk)
|
||||
if not (_WROTE_LINE_RE.search(text) or _ORIG_RE.search(text) or _CJK_ATTRIB_LINE_RE.search(text)):
|
||||
return html_chunk
|
||||
html_chunk = re.sub(
|
||||
rf"(?:<br\s*/?>|</p>|</div>|\n)?\s*On\s.+?\s{_WROTE}\s*:\s*(?:</[^>]+>)*\s*$",
|
||||
"",
|
||||
html_chunk,
|
||||
flags=re.IGNORECASE | re.DOTALL,
|
||||
)
|
||||
html_chunk = re.sub(
|
||||
r"(?:<br\s*/?>|</p>|</div>|\n)?\s*"
|
||||
r"(?:\d{4}[年/.-]\d{1,2}[月/.-]\d{1,2}日?(?:\s*[\(\(][^\)\)]+?[\)\)])?"
|
||||
r"\s+\d{1,2}:\d{2}(?:\s*[APAP][MM])?(?:に|、|,)?"
|
||||
r"\s*(?:.+?\s+)?[<<]?[\w.+\-]+@[\w.\-]+\.[A-Za-z]{2,}[>>]?"
|
||||
r"\s*(?:のメッセージ|さんは(?:書|お?書き)きました|wrote)?\s*[::]"
|
||||
r"\s*(?:</[^>]+>)*\s*$",
|
||||
"",
|
||||
html_chunk,
|
||||
flags=re.DOTALL,
|
||||
)
|
||||
return html_chunk
|
||||
|
||||
head_html = _strip_trailing_attribution("".join(str(n) for n in head_nodes))
|
||||
tail_html = "".join(str(n) for n in tail_nodes)
|
||||
|
||||
# Stitch head + tail. Tail (bottom-posted reply) goes first because
|
||||
# that's the most-recent / most-relevant content; head (which may just
|
||||
# be empty or a forwarded preamble) follows.
|
||||
parts = []
|
||||
if tail_html.strip(): parts.append(tail_html.strip())
|
||||
if head_html.strip(): parts.append(head_html.strip())
|
||||
if parts:
|
||||
turns.append({
|
||||
"level": 0,
|
||||
"body_html": "<br><br>".join(parts) if len(parts) > 1 else parts[0],
|
||||
"meta": None,
|
||||
})
|
||||
|
||||
def _walk(node, level: int):
|
||||
meta_from_node = _extract_quote_meta(str(node))
|
||||
# Recurse into nested quote containers inside this one, then strip
|
||||
# them so the body of THIS turn doesn't include them.
|
||||
nested = [t for t in node.find_all(True, recursive=True) if _is_quote_container(t)]
|
||||
# Keep only direct-quote descendants (no other quote container between)
|
||||
def has_quote_between(child, ancestor) -> bool:
|
||||
p = child.parent
|
||||
while p is not None and p is not ancestor:
|
||||
if _is_quote_container(p):
|
||||
return True
|
||||
p = p.parent
|
||||
return False
|
||||
direct_nested = [n for n in nested if not has_quote_between(n, node)]
|
||||
for n in list(direct_nested):
|
||||
n.extract()
|
||||
body_html = node.decode_contents()
|
||||
|
||||
# Collapse "wrapper-only" quote containers: if the only remaining
|
||||
# content of this node (after pulling out nested quotes) is an
|
||||
# attribution line, don't emit a separate turn for it. Instead,
|
||||
# pass the attribution down as meta for the directly-nested child.
|
||||
# Without this collapse, gmail_quote_container produces a phantom
|
||||
# bubble that contains just the JP/EN attribution line.
|
||||
body_text = re.sub(r"<[^>]+>", " ", body_html).strip()
|
||||
body_text = _html.unescape(body_text)
|
||||
body_text_collapsed = re.sub(r"\s+", " ", body_text).strip()
|
||||
is_attrib_only = bool(body_text_collapsed) and (
|
||||
_CJK_ATTRIB_LINE_RE.match(body_text_collapsed)
|
||||
or re.match(rf"^\s*On\s.+?\s{_WROTE}\s*:\s*$", body_text_collapsed, re.IGNORECASE)
|
||||
or _OUTLOOK_HEADER_RE.match(body_text_collapsed)
|
||||
)
|
||||
if is_attrib_only and len(direct_nested) == 1:
|
||||
# Skip emitting this wrapper. Pass attribution as meta for child.
|
||||
child_meta = meta_from_node or body_text_collapsed
|
||||
# Recurse into child as the SAME level (replacing this wrapper)
|
||||
_walk_with_meta(direct_nested[0], level, child_meta)
|
||||
return
|
||||
|
||||
turns.append({"level": level, "body_html": body_html, "meta": meta_from_node})
|
||||
for n in direct_nested:
|
||||
_walk(n, level + 1)
|
||||
|
||||
def _walk_with_meta(node, level: int, forced_meta: str):
|
||||
"""Variant that uses a passed-in meta when the node's own meta is empty."""
|
||||
meta_from_node = _extract_quote_meta(str(node)) or forced_meta
|
||||
nested = [t for t in node.find_all(True, recursive=True) if _is_quote_container(t)]
|
||||
def has_quote_between(child, ancestor) -> bool:
|
||||
p = child.parent
|
||||
while p is not None and p is not ancestor:
|
||||
if _is_quote_container(p):
|
||||
return True
|
||||
p = p.parent
|
||||
return False
|
||||
direct_nested = [n for n in nested if not has_quote_between(n, node)]
|
||||
for n in list(direct_nested):
|
||||
n.extract()
|
||||
body_html = node.decode_contents()
|
||||
turns.append({"level": level, "body_html": body_html, "meta": meta_from_node})
|
||||
for n in direct_nested:
|
||||
_walk(n, level + 1)
|
||||
|
||||
for bq in tops:
|
||||
_walk(bq, 1)
|
||||
|
||||
if not turns:
|
||||
return None
|
||||
return turns
|
||||
|
||||
|
||||
def parse_thread(body_html: str | None, body_text: str | None) -> list[dict[str, Any]] | None:
|
||||
"""Public entry point. Prefer HTML when available, else plaintext.
|
||||
Returns None if no quoted material found (caller renders flat)."""
|
||||
if body_html:
|
||||
out = _parse_html(body_html)
|
||||
if out:
|
||||
return out
|
||||
if body_text:
|
||||
return _parse_plaintext(body_text)
|
||||
return None
|
||||
Reference in New Issue
Block a user