Fix visual report chapter navigation (#505)

Co-authored-by: Alex Kenley <Alex.Kenley@threatvectorsecurity.com>
This commit is contained in:
Alexander Kenley
2026-06-01 23:26:13 +10:00
committed by GitHub
parent 6ad617931d
commit 07d92556a3
2 changed files with 82 additions and 8 deletions

View File

@@ -19,6 +19,8 @@ import re
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from bs4 import BeautifulSoup
from src.research_utils import strip_thinking
from urllib.parse import urlparse
@@ -68,8 +70,20 @@ def _extract_headings(md_text: str) -> List[Dict[str, str]]:
headings = []
seen_slugs: Dict[str, int] = {}
def _plain_heading_text(text: str) -> str:
text = text.strip().rstrip("#").strip()
text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', r'\1', text)
text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
text = re.sub(r'\[([^\]]+)\]\[[^\]]+\]', r'\1', text)
text = re.sub(r'<[^>]+>', '', text)
text = re.sub(r'[`*_~]+', '', text)
text = html.unescape(text)
return re.sub(r'\s+', ' ', text).strip()
def _make_slug(text: str) -> str:
slug = re.sub(r'[^a-z0-9]+', '-', text.lower()).strip('-')
if not slug:
slug = "section"
if slug in seen_slugs:
seen_slugs[slug] += 1
slug = f"{slug}-{seen_slugs[slug]}"
@@ -79,16 +93,43 @@ def _extract_headings(md_text: str) -> List[Dict[str, str]]:
for m in re.finditer(r'^(#{2,3})\s+(.+)$', md_text, re.MULTILINE):
level = len(m.group(1))
text = m.group(2).strip()
text = _plain_heading_text(m.group(2))
if not text:
continue
headings.append({"level": level, "text": text, "slug": _make_slug(text)})
if not headings:
for m in re.finditer(r'^\*\*([^*]+)\*\*\s*$', md_text, re.MULTILINE):
text = m.group(1).strip().rstrip(':')
text = _plain_heading_text(m.group(1)).rstrip(':')
if 3 < len(text) < 80:
headings.append({"level": 2, "text": text, "slug": _make_slug(text)})
return headings
def _apply_heading_ids(report_html: str, headings: List[Dict[str, str]]) -> str:
"""Force rendered h2/h3 IDs to match the generated sidebar links."""
if not headings:
return report_html
soup = BeautifulSoup(report_html, "html.parser")
rendered_headings = soup.find_all(["h2", "h3"])
for element, heading in zip(rendered_headings, headings):
expected_name = f"h{heading['level']}"
if element.name != expected_name:
logger.debug(
"Visual report heading level mismatch: rendered %s for TOC %s",
element.name,
expected_name,
)
element["id"] = heading["slug"]
if len(rendered_headings) != len(headings):
logger.debug(
"Visual report heading count mismatch: rendered=%s toc=%s",
len(rendered_headings),
len(headings),
)
return str(soup)
# Overlay buttons shown on each image: reroll (swap for the next unused
# scraped image) + hide (remove and skip on future renders). Reroll is
# wired up in the page script using the embedded spare-image pool.
@@ -1650,13 +1691,8 @@ def generate_visual_report(
report_html = _md_to_html(report_markdown)
# Add id anchors to h2/h3 for TOC linking
headings = _extract_headings(report_markdown)
for h in headings:
tag = f"h{h['level']}"
pattern = rf'(<{tag}>)(.*?{re.escape(html.escape(h["text"]))}.*?</{tag}>)'
replacement = rf'<{tag} id="{h["slug"]}">\2'
report_html = re.sub(pattern, replacement, report_html, count=1)
report_html = _apply_heading_ids(report_html, headings)
# Collect all OG images from sources (skip icons, tiny images, known junk)
_IMAGE_BLOCKLIST = {

View File

@@ -0,0 +1,38 @@
from bs4 import BeautifulSoup
from src.visual_report import generate_visual_report
def test_visual_report_toc_links_match_rendered_heading_ids():
    """Each sidebar TOC link must show plain text and resolve to an h2/h3."""
    markdown_source = """
# Automated Crypto Trading Bot Strategies
### **1.0 Introduction & Research Scope**
Intro body.
### **2.0 Determining the "Best" Configuration**
Configuration body.
"""
    expected_titles = [
        "1.0 Introduction & Research Scope",
        '2.0 Determining the "Best" Configuration',
    ]

    rendered_page = generate_visual_report(
        "crypto bot strategies",
        markdown_source,
        sources=[],
        stats={},
        session_id="rp-test",
    )
    page = BeautifulSoup(rendered_page, "html.parser")
    toc_links = page.select(".toc-sidebar nav a")

    # The sidebar shows the heading text with the Markdown emphasis removed.
    link_titles = [anchor.get_text(strip=True) for anchor in toc_links]
    assert link_titles == expected_titles

    # Every link's fragment must point at a heading element that exists.
    for anchor in toc_links:
        anchor_id = anchor["href"].removeprefix("#")
        heading = page.find(id=anchor_id)
        assert heading is not None
        assert heading.name in {"h2", "h3"}