Fix visual report chapter navigation (#505)

Co-authored-by: Alex Kenley <Alex.Kenley@threatvectorsecurity.com>
2026-06-01 23:26:13 +10:00
parent 6ad617931d
commit 07d92556a3
2 changed files with 82 additions and 8 deletions
--- a/src/visual_report.py
+++ b/src/visual_report.py
@@ -19,6 +19,8 @@ import re
 from datetime import datetime
 from typing import Dict, List, Optional, Tuple
 from bs4 import BeautifulSoup
 from src.research_utils import strip_thinking
 from urllib.parse import urlparse
@@ -68,8 +70,20 @@ def _extract_headings(md_text: str) -> List[Dict[str, str]]:
    headings = []
    seen_slugs: Dict[str, int] = {}
    def _plain_heading_text(text: str) -> str:
        text = text.strip().rstrip("#").strip()
        text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', r'\1', text)
        text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
        text = re.sub(r'\[([^\]]+)\]\[[^\]]+\]', r'\1', text)
        text = re.sub(r'<[^>]+>', '', text)
        text = re.sub(r'[`*_~]+', '', text)
        text = html.unescape(text)
        return re.sub(r'\s+', ' ', text).strip()
    def _make_slug(text: str) -> str:
        slug = re.sub(r'[^a-z0-9]+', '-', text.lower()).strip('-')
        if not slug:
            slug = "section"
        if slug in seen_slugs:
            seen_slugs[slug] += 1
            slug = f"{slug}-{seen_slugs[slug]}"
@@ -79,16 +93,43 @@ def _extract_headings(md_text: str) -> List[Dict[str, str]]:
    for m in re.finditer(r'^(#{2,3})\s+(.+)$', md_text, re.MULTILINE):
        level = len(m.group(1))
-        text = m.group(2).strip()
+        text = _plain_heading_text(m.group(2))
        if not text:
            continue
        headings.append({"level": level, "text": text, "slug": _make_slug(text)})
    if not headings:
        for m in re.finditer(r'^\*\*([^*]+)\*\*\s*$', md_text, re.MULTILINE):
-            text = m.group(1).strip().rstrip(':')
+            text = _plain_heading_text(m.group(1)).rstrip(':')
            if 3 < len(text) < 80:
                headings.append({"level": 2, "text": text, "slug": _make_slug(text)})
    return headings
 def _apply_heading_ids(report_html: str, headings: List[Dict[str, str]]) -> str:
    """Force rendered h2/h3 IDs to match the generated sidebar links."""
    if not headings:
        return report_html
    soup = BeautifulSoup(report_html, "html.parser")
    rendered_headings = soup.find_all(["h2", "h3"])
    for element, heading in zip(rendered_headings, headings):
        expected_name = f"h{heading['level']}"
        if element.name != expected_name:
            logger.debug(
                "Visual report heading level mismatch: rendered %s for TOC %s",
                element.name,
                expected_name,
            )
        element["id"] = heading["slug"]
    if len(rendered_headings) != len(headings):
        logger.debug(
            "Visual report heading count mismatch: rendered=%s toc=%s",
            len(rendered_headings),
            len(headings),
        )
    return str(soup)
 # Overlay buttons shown on each image: reroll (swap for the next unused
 # scraped image) + hide (remove and skip on future renders). Reroll is
 # wired up in the page script using the embedded spare-image pool.
@@ -1650,13 +1691,8 @@ def generate_visual_report(
    report_html = _md_to_html(report_markdown)
    # Add id anchors to h2/h3 for TOC linking
    headings = _extract_headings(report_markdown)
-    for h in headings:
+    report_html = _apply_heading_ids(report_html, headings)
        tag = f"h{h['level']}"
        pattern = rf'(<{tag}>)(.*?{re.escape(html.escape(h["text"]))}.*?</{tag}>)'
        replacement = rf'<{tag} id="{h["slug"]}">\2'
        report_html = re.sub(pattern, replacement, report_html, count=1)
    # Collect all OG images from sources (skip icons, tiny images, known junk)
    _IMAGE_BLOCKLIST = {
--- a/tests/test_visual_report.py
+++ b/tests/test_visual_report.py
@@ -0,0 +1,38 @@
 from bs4 import BeautifulSoup
 from src.visual_report import generate_visual_report
 def test_visual_report_toc_links_match_rendered_heading_ids():
    """Check that every TOC sidebar link points at a rendered h2/h3 element.

    The fixture uses level-3 headings wrapped in bold markers whose text
    contains ``&`` and double quotes; the TOC is expected to show the
    plain heading text and each link's fragment must resolve to a heading.
    """
    # Markdown fixture: one h1 title and two bold-wrapped h3 chapter headings.
    report = """
 # Automated Crypto Trading Bot Strategies
 ### **1.0 Introduction & Research Scope**
 Intro body.
 ### **2.0 Determining the "Best" Configuration**
 Configuration body.
 """
    html = generate_visual_report(
        "crypto bot strategies",
        report,
        sources=[],
        stats={},
        session_id="rp-test",
    )
    soup = BeautifulSoup(html, "html.parser")
    links = soup.select(".toc-sidebar nav a")
    # Link labels must be the heading text without the bold markers; the h1
    # title is expected to be absent from the TOC.
    assert [link.get_text(strip=True) for link in links] == [
        "1.0 Introduction & Research Scope",
        '2.0 Determining the "Best" Configuration',
    ]
    # Each href fragment must match the id of an h2 or h3 in the same page.
    for link in links:
        target_id = link["href"].removeprefix("#")
        target = soup.find(id=target_id)
        assert target is not None
        assert target.name in {"h2", "h3"}