From 07d92556a350fb6d0b709e8fbf0b6846ee851f95 Mon Sep 17 00:00:00 2001
From: Alexander Kenley
Date: Mon, 1 Jun 2026 23:26:13 +1000
Subject: [PATCH] Fix visual report chapter navigation (#505)

Co-authored-by: Alex Kenley
---
Review notes (text in this position is ignored by git am and is not part
of the commit message):

* _apply_heading_ids pairs the rendered h2/h3 elements with the TOC
  headings purely by position (zip). A level or count mismatch is only
  logged at debug level, so one heading that _extract_headings matches
  but the renderer does not emit as h2/h3 (or the reverse) moves every
  later id onto the wrong element.
* The **bold** fallback in _extract_headings yields level-2 TOC entries.
  Whether such lines are rendered as h2/h3 depends on _md_to_html, which
  this patch does not show; verify that fallback links get a target.
* _apply_heading_ids calls `logger`, which this patch does not define;
  it is assumed to exist at module level in src/visual_report.py.
* The new test uses str.removeprefix, which requires Python 3.9+.

 src/visual_report.py        | 52 +++++++++++++++++++++++++++++++------
 tests/test_visual_report.py | 38 +++++++++++++++++++++++++++
 2 files changed, 82 insertions(+), 8 deletions(-)
 create mode 100644 tests/test_visual_report.py

diff --git a/src/visual_report.py b/src/visual_report.py
index 47cc55e..fa021cd 100644
--- a/src/visual_report.py
+++ b/src/visual_report.py
@@ -19,6 +19,8 @@ import re
 from datetime import datetime
 from typing import Dict, List, Optional, Tuple
 
+from bs4 import BeautifulSoup
+
 from src.research_utils import strip_thinking
 from urllib.parse import urlparse
 
@@ -68,8 +70,20 @@ def _extract_headings(md_text: str) -> List[Dict[str, str]]:
     headings = []
     seen_slugs: Dict[str, int] = {}
 
+    def _plain_heading_text(text: str) -> str:
+        text = text.strip().rstrip("#").strip()
+        text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', r'\1', text)
+        text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
+        text = re.sub(r'\[([^\]]+)\]\[[^\]]+\]', r'\1', text)
+        text = re.sub(r'<[^>]+>', '', text)
+        text = re.sub(r'[`*_~]+', '', text)
+        text = html.unescape(text)
+        return re.sub(r'\s+', ' ', text).strip()
+
     def _make_slug(text: str) -> str:
         slug = re.sub(r'[^a-z0-9]+', '-', text.lower()).strip('-')
+        if not slug:
+            slug = "section"
         if slug in seen_slugs:
             seen_slugs[slug] += 1
             slug = f"{slug}-{seen_slugs[slug]}"
@@ -79,16 +93,43 @@ def _extract_headings(md_text: str) -> List[Dict[str, str]]:
 
     for m in re.finditer(r'^(#{2,3})\s+(.+)$', md_text, re.MULTILINE):
         level = len(m.group(1))
-        text = m.group(2).strip()
+        text = _plain_heading_text(m.group(2))
+        if not text:
+            continue
         headings.append({"level": level, "text": text, "slug": _make_slug(text)})
     if not headings:
         for m in re.finditer(r'^\*\*([^*]+)\*\*\s*$', md_text, re.MULTILINE):
-            text = m.group(1).strip().rstrip(':')
+            text = _plain_heading_text(m.group(1)).rstrip(':')
             if 3 < len(text) < 80:
                 headings.append({"level": 2, "text": text, "slug": _make_slug(text)})
     return headings
 
 
+def _apply_heading_ids(report_html: str, headings: List[Dict[str, str]]) -> str:
+    """Force rendered h2/h3 IDs to match the generated sidebar links."""
+    if not headings:
+        return report_html
+
+    soup = BeautifulSoup(report_html, "html.parser")
+    rendered_headings = soup.find_all(["h2", "h3"])
+    for element, heading in zip(rendered_headings, headings):
+        expected_name = f"h{heading['level']}"
+        if element.name != expected_name:
+            logger.debug(
+                "Visual report heading level mismatch: rendered %s for TOC %s",
+                element.name,
+                expected_name,
+            )
+        element["id"] = heading["slug"]
+    if len(rendered_headings) != len(headings):
+        logger.debug(
+            "Visual report heading count mismatch: rendered=%s toc=%s",
+            len(rendered_headings),
+            len(headings),
+        )
+    return str(soup)
+
+
 # Overlay buttons shown on each image: reroll (swap for the next unused
 # scraped image) + hide (remove and skip on future renders). Reroll is
 # wired up in the page script using the embedded spare-image pool.
@@ -1650,13 +1691,8 @@ def generate_visual_report(
 
     report_html = _md_to_html(report_markdown)
 
-    # Add id anchors to h2/h3 for TOC linking
     headings = _extract_headings(report_markdown)
-    for h in headings:
-        tag = f"h{h['level']}"
-        pattern = rf'(<{tag}>)(.*?{re.escape(html.escape(h["text"]))}.*?)'
-        replacement = rf'<{tag} id="{h["slug"]}">\2'
-        report_html = re.sub(pattern, replacement, report_html, count=1)
+    report_html = _apply_heading_ids(report_html, headings)
 
     # Collect all OG images from sources (skip icons, tiny images, known junk)
     _IMAGE_BLOCKLIST = {
diff --git a/tests/test_visual_report.py b/tests/test_visual_report.py
new file mode 100644
index 0000000..41d6e3c
--- /dev/null
+++ b/tests/test_visual_report.py
@@ -0,0 +1,38 @@
+from bs4 import BeautifulSoup
+
+from src.visual_report import generate_visual_report
+
+
+def test_visual_report_toc_links_match_rendered_heading_ids():
+    report = """
+# Automated Crypto Trading Bot Strategies
+
+### **1.0 Introduction & Research Scope**
+
+Intro body.
+
+### **2.0 Determining the "Best" Configuration**
+
+Configuration body.
+"""
+
+    html = generate_visual_report(
+        "crypto bot strategies",
+        report,
+        sources=[],
+        stats={},
+        session_id="rp-test",
+    )
+    soup = BeautifulSoup(html, "html.parser")
+
+    links = soup.select(".toc-sidebar nav a")
+    assert [link.get_text(strip=True) for link in links] == [
+        "1.0 Introduction & Research Scope",
+        '2.0 Determining the "Best" Configuration',
+    ]
+
+    for link in links:
+        target_id = link["href"].removeprefix("#")
+        target = soup.find(id=target_id)
+        assert target is not None
+        assert target.name in {"h2", "h3"}