Decode email headers without injected spaces
Use email.header.make_header for MIME header decoding so adjacent encoded/plain header parts preserve RFC spacing, with regression coverage.
This commit is contained in:
@@ -337,14 +337,25 @@ def _decode_header(raw):
|
||||
"""Decode MIME encoded header."""
|
||||
if not raw:
|
||||
return ""
|
||||
parts = email.header.decode_header(raw)
|
||||
decoded = []
|
||||
for data, charset in parts:
|
||||
if isinstance(data, bytes):
|
||||
decoded.append(data.decode(charset or "utf-8", errors="replace"))
|
||||
else:
|
||||
decoded.append(data)
|
||||
return " ".join(decoded)
|
||||
try:
|
||||
# make_header concatenates per RFC 2047: no spurious space between an
|
||||
# encoded-word and adjacent plain text (plain runs keep their own
|
||||
# whitespace), and whitespace between two adjacent encoded-words is
|
||||
# dropped. The old " ".join produced "Re: Jose" style double spaces
|
||||
# on every non-ASCII subject or sender.
|
||||
return str(email.header.make_header(email.header.decode_header(raw)))
|
||||
except Exception:
|
||||
# Malformed header or unknown charset: lossy per-part decode
|
||||
decoded = []
|
||||
for data, charset in email.header.decode_header(raw):
|
||||
if isinstance(data, bytes):
|
||||
try:
|
||||
decoded.append(data.decode(charset or "utf-8", errors="replace"))
|
||||
except LookupError:
|
||||
decoded.append(data.decode("utf-8", errors="replace"))
|
||||
else:
|
||||
decoded.append(data)
|
||||
return "".join(decoded)
|
||||
|
||||
|
||||
def _extract_text(msg):
|
||||
|
||||
34
tests/test_mcp_email_decode_header_spaces.py
Normal file
34
tests/test_mcp_email_decode_header_spaces.py
Normal file
@@ -0,0 +1,34 @@
|
||||
"""mcp email server _decode_header must not inject spaces between parts.
|
||||
|
||||
email.header.decode_header returns plain-text runs WITH their surrounding
|
||||
whitespace (e.g. (b"Re: ", None)), so joining parts with " " produced a
|
||||
double space after "Re:" on every non-ASCII subject, a spurious space in
|
||||
"Name <addr>" senders, and violated RFC 2047 6.2 which requires whitespace
|
||||
between two adjacent encoded-words to be dropped.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
pytest.importorskip("mcp")
|
||||
|
||||
import mcp_servers.email_server as es
|
||||
|
||||
|
||||
def test_prefix_then_encoded_word_single_space():
    """A plain prefix followed by an encoded-word keeps a single space."""
    decoded = es._decode_header("Re: =?utf-8?b?SsOzc2U=?=")
    expected = "Re: J\u00f3se"
    assert decoded == expected
|
||||
|
||||
|
||||
def test_encoded_word_then_plain_text():
    """An encoded-word followed by plain text keeps the text's own space."""
    decoded = es._decode_header("=?utf-8?b?SsOzc2U=?= Smith")
    expected = "J\u00f3se Smith"
    assert decoded == expected
|
||||
|
||||
|
||||
def test_adjacent_encoded_words_join_without_space():
    """Whitespace between two adjacent encoded-words is dropped."""
    header = "=?iso-8859-1?q?Caf=E9?= =?utf-8?b?5pel5pys?="
    assert es._decode_header(header) == "Caf\u00e9\u65e5\u672c"
|
||||
|
||||
|
||||
def test_plain_ascii_header_unchanged():
    """A header without encoded-words comes back untouched."""
    decoded = es._decode_header("Weekly report")
    assert decoded == "Weekly report"
|
||||
|
||||
|
||||
def test_empty_header():
    """An empty header decodes to an empty string."""
    decoded = es._decode_header("")
    assert decoded == ""
|
||||
Reference in New Issue
Block a user