Sanitize preserved markdown HTML
`mdToHtml` deliberately stashes literal <details> blocks and <a> tags from the source text *before* the global HTML-escape pass and restores them verbatim into the string callers assign to `innerHTML` (e.g. chatRenderer's `b.innerHTML = ...processWithThinking(text)`). Nothing scrubbed those fragments, so message/agent content containing `<details><img src=x onerror=...></details>` or `<a href="javascript:..." onmouseover=...>` executed arbitrary script in the authenticated page. Route both stashed fragments through `sanitizeAllowedHtml()`, which parses them in an inert <template> (no resource loads, no script execution), removes script-capable elements, and strips event-handler attributes plus javascript:/vbscript:/data: URL schemes. Hardening details: - Compare tag names case-insensitively and drop the SVG/MathML foreign- content roots. An SVG-namespaced <script> has the lower-case tagName 'script', so an HTML-only upper-case check would miss it — a real bypass. - Sanitize to a fixpoint (re-parse + re-clean until stable) to blunt mutation-XSS, where re-serializing/re-parsing reshapes the tree. Benign anchors and <details> blocks are preserved unchanged. Verified under jsdom against the obvious vectors plus mutation-XSS probes (svg/math-namespaced <script>, foreignObject, ns-confusion, comment breakout, template smuggling): no script/iframe element, event handler, or javascript:/data: URL survives, and benign markup is kept. Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -34,6 +34,83 @@ function linkHtml(text, url) {
|
||||
return `<a href="${escapeHtml(safeUrl)}" target="_blank" rel="noopener noreferrer">${safeText}</a>`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sanitize the raw-HTML fragments that mdToHtml deliberately preserves from
|
||||
* the source text — <details> blocks (collapsible agent output) and <a> tags
|
||||
* (emitted by the markdown link pass). Those fragments are later restored
|
||||
* verbatim into innerHTML, so without scrubbing them a model — or any content
|
||||
* routed through here — could smuggle in an `<img onerror=...>`, an
|
||||
* `<a href="javascript:...">`, an `onmouseover=` handler, etc. and execute
|
||||
* script in the authenticated page (DOM XSS).
|
||||
*
|
||||
* Parsing into a <template> is inert: assigning to template.innerHTML neither
|
||||
* fetches resources nor runs scripts, so we can walk the resulting tree,
|
||||
* drop script-capable elements, and strip event-handler attributes and
|
||||
* dangerous URL schemes before the (now safe) fragment is handed back.
|
||||
*/
|
||||
const _ALLOWED_HTML_BAD_TAGS = new Set([
|
||||
'SCRIPT', 'IFRAME', 'OBJECT', 'EMBED', 'LINK', 'META',
|
||||
'STYLE', 'BASE', 'FORM', 'NOSCRIPT', 'TEMPLATE',
|
||||
// Foreign-content roots. SVG/MathML have their own parser rules and are a
|
||||
// classic mutation-XSS vehicle — e.g. an SVG-namespaced <script>, whose
|
||||
// `tagName` is the lower-case 'script' and would slip a name check that
|
||||
// assumed HTML's upper-casing. They aren't needed in the <details>/<a>
|
||||
// fragments we preserve, so drop the whole subtree.
|
||||
'SVG', 'MATH',
|
||||
]);
|
||||
const _ALLOWED_HTML_URL_ATTRS = new Set([
|
||||
'href', 'src', 'xlink:href', 'action', 'formaction', 'background', 'poster',
|
||||
]);
|
||||
|
||||
function _cleanAllowedHtmlOnce(htmlString) {
|
||||
const tpl = document.createElement('template');
|
||||
tpl.innerHTML = htmlString;
|
||||
for (const el of Array.from(tpl.content.querySelectorAll('*'))) {
|
||||
// Upper-case the tag for comparison: HTML tagNames are upper-case, but
|
||||
// SVG/MathML elements preserve their original (lower/camel) case, so a
|
||||
// raw `Set.has(el.tagName)` would miss e.g. a namespaced <script>.
|
||||
if (_ALLOWED_HTML_BAD_TAGS.has(el.tagName.toUpperCase())) {
|
||||
el.remove();
|
||||
continue;
|
||||
}
|
||||
for (const attr of Array.from(el.attributes)) {
|
||||
const name = attr.name.toLowerCase();
|
||||
// Drop every inline event handler (onerror, onclick, onmouseover, ...)
|
||||
// and srcdoc (a frame-less script vector).
|
||||
if (name.startsWith('on') || name === 'srcdoc') {
|
||||
el.removeAttribute(attr.name);
|
||||
continue;
|
||||
}
|
||||
// Neutralize javascript:/vbscript:/data: in URL-bearing attributes.
|
||||
// Strip control/space chars first so e.g. "java\tscript:" can't slip by.
|
||||
if (_ALLOWED_HTML_URL_ATTRS.has(name)) {
|
||||
const value = (attr.value || '').replace(/[\x00-\x20]+/g, '').toLowerCase();
|
||||
if (/^(javascript|vbscript|data):/.test(value)) {
|
||||
el.removeAttribute(attr.name);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return tpl.innerHTML;
|
||||
}
|
||||
|
||||
function sanitizeAllowedHtml(html) {
|
||||
const raw = String(html == null ? '' : html);
|
||||
// Non-browser context (e.g. a future SSR/Node import): fail closed by
|
||||
// escaping rather than trusting the markup.
|
||||
if (typeof document === 'undefined') return escapeHtml(raw);
|
||||
|
||||
// Sanitize to a fixpoint. Re-parsing the serialized output can mutate the
|
||||
// tree (the basis of mutation-XSS), so re-clean until it stops changing.
|
||||
let out = raw;
|
||||
for (let i = 0; i < 4; i++) {
|
||||
const next = _cleanAllowedHtmlOnce(out);
|
||||
if (next === out) break;
|
||||
out = next;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if text has unclosed think tag
|
||||
*/
|
||||
@@ -356,14 +433,14 @@ export function mdToHtml(src) {
|
||||
// Default to open so agent output is visible
|
||||
s = s.replace(/<details>([\s\S]*?)<\/details>/gi, (match) => {
|
||||
const placeholder = `___ALLOWED_HTML_${allowedHtmlBlocks.length}___`;
|
||||
allowedHtmlBlocks.push(match.replace(/<details>/i, '<details open>'));
|
||||
allowedHtmlBlocks.push(sanitizeAllowedHtml(match.replace(/<details>/i, '<details open>')));
|
||||
return placeholder;
|
||||
});
|
||||
|
||||
// ALSO preserve <a> tags the same way (they're now in the HTML from markdown conversion)
|
||||
s = s.replace(/<a\s+[^>]*>.*?<\/a>/gi, (match) => {
|
||||
const placeholder = `___ALLOWED_HTML_${allowedHtmlBlocks.length}___`;
|
||||
allowedHtmlBlocks.push(match);
|
||||
allowedHtmlBlocks.push(sanitizeAllowedHtml(match));
|
||||
return placeholder;
|
||||
});
|
||||
|
||||
|
||||
Reference in New Issue
Block a user