Sanitize preserved markdown HTML

`mdToHtml` deliberately stashes literal <details> blocks and <a> tags from the source text *before* the global HTML-escape pass and restores them verbatim into the string callers assign to `innerHTML` (e.g. chatRenderer's `b.innerHTML = ...processWithThinking(text)`). Nothing scrubbed those fragments, so message/agent content containing `<details><img src=x onerror=...></details>` or `<a href="javascript:..." onmouseover=...>` executed arbitrary script in the authenticated page. Route both stashed fragments through `sanitizeAllowedHtml()`, which parses them in an inert <template> (no resource loads, no script execution), removes script-capable elements, and strips event-handler attributes plus javascript:/vbscript:/data: URL schemes. Hardening details: - Compare tag names case-insensitively and drop the SVG/MathML foreign- content roots. An SVG-namespaced <script> has the lower-case tagName 'script', so an HTML-only upper-case check would miss it — a real bypass. - Sanitize to a fixpoint (re-parse + re-clean until stable) to blunt mutation-XSS, where re-serializing/re-parsing reshapes the tree. Benign anchors and <details> blocks are preserved unchanged. Verified under jsdom against the obvious vectors plus mutation-XSS probes (svg/math-namespaced <script>, foreignObject, ns-confusion, comment breakout, template smuggling): no script/iframe element, event handler, or javascript:/data: URL survives, and benign markup is kept. Co-authored-by: Claude <noreply@anthropic.com>
2026-06-01 22:58:38 +02:00
parent 491a8a5480
commit b70ae56ffa
1 changed files with 79 additions and 2 deletions
--- a/static/js/markdown.js
+++ b/static/js/markdown.js
@@ -34,6 +34,83 @@ function linkHtml(text, url) {
  return `<a href="${escapeHtml(safeUrl)}" target="_blank" rel="noopener noreferrer">${safeText}</a>`;
 }

+/**
+ * Sanitize the raw-HTML fragments that mdToHtml deliberately preserves from
+ * the source text — <details> blocks (collapsible agent output) and <a> tags
+ * (emitted by the markdown link pass). Those fragments are later restored
+ * verbatim into innerHTML, so without scrubbing them a model — or any content
+ * routed through here — could smuggle in an `<img onerror=...>`, an
+ * `<a href="javascript:...">`, an `onmouseover=` handler, etc. and execute
+ * script in the authenticated page (DOM XSS).
+ *
+ * Parsing into a <template> is inert: assigning to template.innerHTML neither
+ * fetches resources nor runs scripts, so we can walk the resulting tree,
+ * drop script-capable elements, and strip event-handler attributes and
+ * dangerous URL schemes before the (now safe) fragment is handed back.
+ */
+const _ALLOWED_HTML_BAD_TAGS = new Set([
+  'SCRIPT', 'IFRAME', 'OBJECT', 'EMBED', 'LINK', 'META',
+  'STYLE', 'BASE', 'FORM', 'NOSCRIPT', 'TEMPLATE',
+  // Foreign-content roots. SVG/MathML have their own parser rules and are a
+  // classic mutation-XSS vehicle — e.g. an SVG-namespaced <script>, whose
+  // `tagName` is the lower-case 'script' and would slip a name check that
+  // assumed HTML's upper-casing. They aren't needed in the <details>/<a>
+  // fragments we preserve, so drop the whole subtree.
+  'SVG', 'MATH',
+]);
+const _ALLOWED_HTML_URL_ATTRS = new Set([
+  'href', 'src', 'xlink:href', 'action', 'formaction', 'background', 'poster',
+]);
+
+function _cleanAllowedHtmlOnce(htmlString) {
+  const tpl = document.createElement('template');
+  tpl.innerHTML = htmlString;
+  for (const el of Array.from(tpl.content.querySelectorAll('*'))) {
+    // Upper-case the tag for comparison: HTML tagNames are upper-case, but
+    // SVG/MathML elements preserve their original (lower/camel) case, so a
+    // raw `Set.has(el.tagName)` would miss e.g. a namespaced <script>.
+    if (_ALLOWED_HTML_BAD_TAGS.has(el.tagName.toUpperCase())) {
+      el.remove();
+      continue;
+    }
+    for (const attr of Array.from(el.attributes)) {
+      const name = attr.name.toLowerCase();
+      // Drop every inline event handler (onerror, onclick, onmouseover, ...)
+      // and srcdoc (a frame-less script vector).
+      if (name.startsWith('on') || name === 'srcdoc') {
+        el.removeAttribute(attr.name);
+        continue;
+      }
+      // Neutralize javascript:/vbscript:/data: in URL-bearing attributes.
+      // Strip control/space chars first so e.g. "java\tscript:" can't slip by.
+      if (_ALLOWED_HTML_URL_ATTRS.has(name)) {
+        const value = (attr.value || '').replace(/[\x00-\x20]+/g, '').toLowerCase();
+        if (/^(javascript|vbscript|data):/.test(value)) {
+          el.removeAttribute(attr.name);
+        }
+      }
+    }
+  }
+  return tpl.innerHTML;
+}
+
+function sanitizeAllowedHtml(html) {
+  const raw = String(html == null ? '' : html);
+  // Non-browser context (e.g. a future SSR/Node import): fail closed by
+  // escaping rather than trusting the markup.
+  if (typeof document === 'undefined') return escapeHtml(raw);
+
+  // Sanitize to a fixpoint. Re-parsing the serialized output can mutate the
+  // tree (the basis of mutation-XSS), so re-clean until it stops changing.
+  let out = raw;
+  for (let i = 0; i < 4; i++) {
+    const next = _cleanAllowedHtmlOnce(out);
+    if (next === out) break;
+    out = next;
+  }
+  return out;
+}
+
 /**
 * Check if text has unclosed think tag
 */
@@ -356,14 +433,14 @@ export function mdToHtml(src) {
  // Default to open so agent output is visible
  s = s.replace(/<details>([\s\S]*?)<\/details>/gi, (match) => {
    const placeholder = `___ALLOWED_HTML_${allowedHtmlBlocks.length}___`;
-    allowedHtmlBlocks.push(match.replace(/<details>/i, '<details open>'));
+    allowedHtmlBlocks.push(sanitizeAllowedHtml(match.replace(/<details>/i, '<details open>')));
    return placeholder;
  });

  // ALSO preserve <a> tags the same way (they're now in the HTML from markdown conversion)
  s = s.replace(/<a\s+[^>]*>.*?<\/a>/gi, (match) => {
    const placeholder = `___ALLOWED_HTML_${allowedHtmlBlocks.length}___`;
-    allowedHtmlBlocks.push(match);
+    allowedHtmlBlocks.push(sanitizeAllowedHtml(match));
    return placeholder;
  });