diff --git a/requirements.txt b/requirements.txt index e4630d1..2c40729 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,6 +21,10 @@ youtube-transcript-api # Markdown rendering for research reports (src/visual_report.py). # Imported at module-top so it's a hard core dep, not optional. markdown +# HTML sanitizer for rendered research reports (src/visual_report.py). Report +# content is untrusted (LLM output over crawled pages) and report pages run +# under a relaxed CSP, so the rendered HTML is allowlist-sanitized. +nh3 # Calendar .ics import/export (routes/calendar_routes.py). icalendar # Recurrence rule expansion for calendar events (routes/calendar_routes.py). diff --git a/src/visual_report.py b/src/visual_report.py index 70af4b2..b15c800 100644 --- a/src/visual_report.py +++ b/src/visual_report.py @@ -25,9 +25,27 @@ from src.research_utils import strip_thinking from urllib.parse import urlparse import markdown +import nh3 logger = logging.getLogger(__name__) +# Tags/attributes permitted in rendered research-report HTML. Starts from nh3's +# safe defaults (which drop ", + '', + "", + 'x', +]) +def test_md_to_html_strips_active_content(payload): + from src.visual_report import _md_to_html + + out = _md_to_html(f"Report body.\n\n{payload}").lower() + + assert "\nRaw findings\n\ncontent\n\n" + ) + out = _md_to_html(md) + + assert "

on a + # report page served under `script-src 'unsafe-inline'`, so it must be escaped + # or it's an attribute-injection XSS independent of the markdown body. + from src.visual_report import generate_visual_report + + html = generate_visual_report( + question="q", + report_markdown="## H\n\nbody", + category='">', + ) + + assert "