Sanitize preserved markdown HTML

`mdToHtml` deliberately stashes literal <details> blocks and <a> tags from
the source text *before* the global HTML-escape pass and restores them
verbatim into the string callers assign to `innerHTML` (e.g. chatRenderer's
`b.innerHTML = ...processWithThinking(text)`). Nothing scrubbed those
fragments, so message/agent content containing
`<details><img src=x onerror=...></details>` or
`<a href="javascript:..." onmouseover=...>` executed arbitrary script in
the authenticated page.

Route both stashed fragments through `sanitizeAllowedHtml()`, which parses
them in an inert <template> (no resource loads, no script execution),
removes script-capable elements, and strips event-handler attributes plus
javascript:/vbscript:/data: URL schemes. Hardening details:

- Compare tag names case-insensitively and drop the SVG/MathML foreign-
  content roots. An SVG-namespaced <script> has the lower-case tagName
  'script', so an HTML-only upper-case check would miss it — a real bypass.
- Sanitize to a fixpoint (re-parse + re-clean until stable) to blunt
  mutation-XSS, where re-serializing/re-parsing reshapes the tree.

Benign anchors and <details> blocks are preserved unchanged.

Verified under jsdom against the obvious vectors plus mutation-XSS probes
(svg/math-namespaced <script>, foreignObject, ns-confusion, comment
breakout, template smuggling): no script/iframe element, event handler, or
javascript:/data: URL survives, and benign markup is kept.

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
SurprisedDuck
2026-06-01 22:58:38 +02:00
committed by GitHub
parent 491a8a5480
commit b70ae56ffa

View File

@@ -34,6 +34,83 @@ function linkHtml(text, url) {
return `<a href="${escapeHtml(safeUrl)}" target="_blank" rel="noopener noreferrer">${safeText}</a>`;
}
/**
* Sanitize the raw-HTML fragments that mdToHtml deliberately preserves from
* the source text — <details> blocks (collapsible agent output) and <a> tags
* (emitted by the markdown link pass). Those fragments are later restored
* verbatim into innerHTML, so without scrubbing them a model — or any content
* routed through here — could smuggle in an `<img onerror=...>`, an
* `<a href="javascript:...">`, an `onmouseover=` handler, etc. and execute
* script in the authenticated page (DOM XSS).
*
* Parsing into a <template> is inert: assigning to template.innerHTML neither
* fetches resources nor runs scripts, so we can walk the resulting tree,
* drop script-capable elements, and strip event-handler attributes and
* dangerous URL schemes before the (now safe) fragment is handed back.
*/
const _ALLOWED_HTML_BAD_TAGS = new Set([
'SCRIPT', 'IFRAME', 'OBJECT', 'EMBED', 'LINK', 'META',
'STYLE', 'BASE', 'FORM', 'NOSCRIPT', 'TEMPLATE',
// Foreign-content roots. SVG/MathML have their own parser rules and are a
// classic mutation-XSS vehicle — e.g. an SVG-namespaced <script>, whose
// `tagName` is the lower-case 'script' and would slip a name check that
// assumed HTML's upper-casing. They aren't needed in the <details>/<a>
// fragments we preserve, so drop the whole subtree.
'SVG', 'MATH',
]);
const _ALLOWED_HTML_URL_ATTRS = new Set([
'href', 'src', 'xlink:href', 'action', 'formaction', 'background', 'poster',
]);
function _cleanAllowedHtmlOnce(htmlString) {
const tpl = document.createElement('template');
tpl.innerHTML = htmlString;
for (const el of Array.from(tpl.content.querySelectorAll('*'))) {
// Upper-case the tag for comparison: HTML tagNames are upper-case, but
// SVG/MathML elements preserve their original (lower/camel) case, so a
// raw `Set.has(el.tagName)` would miss e.g. a namespaced <script>.
if (_ALLOWED_HTML_BAD_TAGS.has(el.tagName.toUpperCase())) {
el.remove();
continue;
}
for (const attr of Array.from(el.attributes)) {
const name = attr.name.toLowerCase();
// Drop every inline event handler (onerror, onclick, onmouseover, ...)
// and srcdoc (a frame-less script vector).
if (name.startsWith('on') || name === 'srcdoc') {
el.removeAttribute(attr.name);
continue;
}
// Neutralize javascript:/vbscript:/data: in URL-bearing attributes.
// Strip control/space chars first so e.g. "java\tscript:" can't slip by.
if (_ALLOWED_HTML_URL_ATTRS.has(name)) {
const value = (attr.value || '').replace(/[\x00-\x20]+/g, '').toLowerCase();
if (/^(javascript|vbscript|data):/.test(value)) {
el.removeAttribute(attr.name);
}
}
}
}
return tpl.innerHTML;
}
function sanitizeAllowedHtml(html) {
const raw = String(html == null ? '' : html);
// Non-browser context (e.g. a future SSR/Node import): fail closed by
// escaping rather than trusting the markup.
if (typeof document === 'undefined') return escapeHtml(raw);
// Sanitize to a fixpoint. Re-parsing the serialized output can mutate the
// tree (the basis of mutation-XSS), so re-clean until it stops changing.
let out = raw;
for (let i = 0; i < 4; i++) {
const next = _cleanAllowedHtmlOnce(out);
if (next === out) break;
out = next;
}
return out;
}
/**
* Check if text has unclosed think tag
*/
@@ -356,14 +433,14 @@ export function mdToHtml(src) {
// Default to open so agent output is visible
s = s.replace(/<details>([\s\S]*?)<\/details>/gi, (match) => {
const placeholder = `___ALLOWED_HTML_${allowedHtmlBlocks.length}___`;
allowedHtmlBlocks.push(match.replace(/<details>/i, '<details open>'));
allowedHtmlBlocks.push(sanitizeAllowedHtml(match.replace(/<details>/i, '<details open>')));
return placeholder;
});
// ALSO preserve <a> tags the same way (they're now in the HTML from markdown conversion)
s = s.replace(/<a\s+[^>]*>.*?<\/a>/gi, (match) => {
const placeholder = `___ALLOWED_HTML_${allowedHtmlBlocks.length}___`;
allowedHtmlBlocks.push(match);
allowedHtmlBlocks.push(sanitizeAllowedHtml(match));
return placeholder;
});