From e678ff753f12de58ca5bb1ce64d11e044d47ef20 Mon Sep 17 00:00:00 2001 From: Shaw Date: Wed, 3 Jun 2026 01:24:20 -0400 Subject: [PATCH] fix(email): guard _decode_header against unknown MIME charset (#1354) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A header that declares an unknown or invalid MIME charset (e.g. a malformed or spam Subject like =?x-unknown-charset?B?...?=) raised an uncaught LookupError. bytes.decode(..., errors="replace") only handles byte-decode errors, not codec *lookup* failures, so the "replace" safety net did not apply. _decode_header decodes Subject/From/To/Cc for the inbox list, single-message fetch, and the background mail pollers (routes/email_routes.py, routes/email_pollers.py, src/builtin_actions.py), so a single bad message could crash the whole inbox render or the poller loop. Wrap the per-part decode in try/except (LookupError, ValueError) and fall back to utf-8/replace. Valid charsets (utf-8, iso-8859-1, ...) are unchanged. Adds tests/test_email_decode_header.py — the unknown-charset case fails before this change and passes after. --- routes/email_helpers.py | 8 ++++- tests/test_email_decode_header.py | 51 +++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 tests/test_email_decode_header.py diff --git a/routes/email_helpers.py b/routes/email_helpers.py index 180ebf9..409c6c4 100644 --- a/routes/email_helpers.py +++ b/routes/email_helpers.py @@ -751,7 +751,13 @@ def _decode_header(raw): decoded = [] for data, charset in parts: if isinstance(data, bytes): - decoded.append(data.decode(charset or "utf-8", errors="replace")) + try: + decoded.append(data.decode(charset or "utf-8", errors="replace")) + except (LookupError, ValueError): + # Unknown/invalid MIME charset (e.g. a malformed or spam header + # like =?x-unknown-charset?B?...?=). errors="replace" only covers + # byte-decode errors, not codec lookup, so fall back to utf-8. + decoded.append(data.decode("utf-8", errors="replace")) else: decoded.append(data) return " ".join(decoded) diff --git a/tests/test_email_decode_header.py b/tests/test_email_decode_header.py new file mode 100644 index 0000000..de45293 --- /dev/null +++ b/tests/test_email_decode_header.py @@ -0,0 +1,51 @@ +"""Regression tests for routes.email_helpers._decode_header. + +A single email whose Subject/From/To/Cc header declares an unknown or invalid +MIME charset (e.g. `=?x-unknown-charset?B?...?=`, common in spam/malformed mail) +used to raise an uncaught LookupError, because `bytes.decode(..., errors="replace")` +only handles byte-decode errors — not codec *lookup* failures. That crash +propagated into the inbox list endpoint, message fetch, and the background mail +pollers (routes/email_routes.py, routes/email_pollers.py, src/builtin_actions.py), +so one bad message could take down the whole inbox render / poller loop. + +These pin the fallback so a bogus charset degrades gracefully to utf-8. +""" +import os +import tempfile +from pathlib import Path + +_tmp_data = Path(tempfile.mkdtemp(prefix="odysseus_decode_hdr_")) +os.environ.setdefault("DATA_DIR", str(_tmp_data)) +os.environ.setdefault("DATABASE_URL", f"sqlite:///{_tmp_data / 'app.db'}") + +from routes.email_helpers import _decode_header + + +def test_unknown_charset_does_not_raise(): + # The regression: an unknown codec name must not raise LookupError. + assert _decode_header("=?x-unknown-charset?B?aGVsbG8=?=") == "hello" + + +def test_invalid_charset_falls_back_to_utf8(): + # A made-up charset on non-ASCII bytes should still produce a string. + raw = "=?totally-bogus?Q?caf=C3=A9?=" + out = _decode_header(raw) + assert isinstance(out, str) + assert "caf" in out + + +def test_valid_utf8_unchanged(): + assert _decode_header("=?utf-8?B?SGVsbG8gV29ybGQ=?=") == "Hello World" + + +def test_valid_iso8859_1_unchanged(): + assert _decode_header("=?iso-8859-1?Q?caf=E9?=") == "café" + + +def test_plain_ascii_passthrough(): + assert _decode_header("Just a subject") == "Just a subject" + + +def test_empty_and_none(): + assert _decode_header("") == "" + assert _decode_header(None) == ""