Decode email headers without injected spaces
Use email.header.make_header for MIME header decoding so that adjacent encoded and plain header parts keep the spacing required by RFC 2047, and add regression tests for the spacing cases.
This commit is contained in:
@@ -337,14 +337,25 @@ def _decode_header(raw):
|
|||||||
"""Decode MIME encoded header."""
|
"""Decode MIME encoded header."""
|
||||||
if not raw:
|
if not raw:
|
||||||
return ""
|
return ""
|
||||||
parts = email.header.decode_header(raw)
|
try:
|
||||||
decoded = []
|
# make_header concatenates per RFC 2047: no spurious space between an
|
||||||
for data, charset in parts:
|
# encoded-word and adjacent plain text (plain runs keep their own
|
||||||
if isinstance(data, bytes):
|
# whitespace), and whitespace between two adjacent encoded-words is
|
||||||
decoded.append(data.decode(charset or "utf-8", errors="replace"))
|
# dropped. The old " ".join produced "Re:  Jose" style double spaces
|
||||||
else:
|
# on every non-ASCII subject or sender.
|
||||||
decoded.append(data)
|
return str(email.header.make_header(email.header.decode_header(raw)))
|
||||||
return " ".join(decoded)
|
except Exception:
|
||||||
|
# Malformed header or unknown charset: lossy per-part decode
|
||||||
|
decoded = []
|
||||||
|
for data, charset in email.header.decode_header(raw):
|
||||||
|
if isinstance(data, bytes):
|
||||||
|
try:
|
||||||
|
decoded.append(data.decode(charset or "utf-8", errors="replace"))
|
||||||
|
except LookupError:
|
||||||
|
decoded.append(data.decode("utf-8", errors="replace"))
|
||||||
|
else:
|
||||||
|
decoded.append(data)
|
||||||
|
return "".join(decoded)
|
||||||
|
|
||||||
|
|
||||||
def _extract_text(msg):
|
def _extract_text(msg):
|
||||||
|
|||||||
34
tests/test_mcp_email_decode_header_spaces.py
Normal file
34
tests/test_mcp_email_decode_header_spaces.py
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
"""mcp email server _decode_header must not inject spaces between parts.
|
||||||
|
|
||||||
|
email.header.decode_header returns plain-text runs WITH their surrounding
|
||||||
|
whitespace (e.g. (b"Re: ", None)), so joining parts with " " produced a
|
||||||
|
double space after "Re:" on every non-ASCII subject, a spurious space in
|
||||||
|
"Name <addr>" senders, and violated RFC 2047 section 6.2, which requires whitespace
|
||||||
|
between two adjacent encoded-words to be dropped.
|
||||||
|
"""
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
pytest.importorskip("mcp")
|
||||||
|
|
||||||
|
import mcp_servers.email_server as es
|
||||||
|
|
||||||
|
|
||||||
|
def test_prefix_then_encoded_word_single_space():
|
||||||
|
assert es._decode_header("Re: =?utf-8?b?SsOzc2U=?=") == "Re: J\u00f3se"
|
||||||
|
|
||||||
|
|
||||||
|
def test_encoded_word_then_plain_text():
|
||||||
|
assert es._decode_header("=?utf-8?b?SsOzc2U=?= Smith") == "J\u00f3se Smith"
|
||||||
|
|
||||||
|
|
||||||
|
def test_adjacent_encoded_words_join_without_space():
|
||||||
|
out = es._decode_header("=?iso-8859-1?q?Caf=E9?= =?utf-8?b?5pel5pys?=")
|
||||||
|
assert out == "Caf\u00e9\u65e5\u672c"
|
||||||
|
|
||||||
|
|
||||||
|
def test_plain_ascii_header_unchanged():
|
||||||
|
assert es._decode_header("Weekly report") == "Weekly report"
|
||||||
|
|
||||||
|
|
||||||
|
def test_empty_header():
|
||||||
|
assert es._decode_header("") == ""
|
||||||
Reference in New Issue
Block a user