Fix visual report chapter navigation (#505)
Co-authored-by: Alex Kenley <Alex.Kenley@threatvectorsecurity.com>
This commit is contained in:
@@ -19,6 +19,8 @@ import re
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Dict, List, Optional, Tuple
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from src.research_utils import strip_thinking
|
from src.research_utils import strip_thinking
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
@@ -68,8 +70,20 @@ def _extract_headings(md_text: str) -> List[Dict[str, str]]:
|
|||||||
headings = []
|
headings = []
|
||||||
seen_slugs: Dict[str, int] = {}
|
seen_slugs: Dict[str, int] = {}
|
||||||
|
|
||||||
|
def _plain_heading_text(text: str) -> str:
|
||||||
|
text = text.strip().rstrip("#").strip()
|
||||||
|
text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', r'\1', text)
|
||||||
|
text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
|
||||||
|
text = re.sub(r'\[([^\]]+)\]\[[^\]]+\]', r'\1', text)
|
||||||
|
text = re.sub(r'<[^>]+>', '', text)
|
||||||
|
text = re.sub(r'[`*_~]+', '', text)
|
||||||
|
text = html.unescape(text)
|
||||||
|
return re.sub(r'\s+', ' ', text).strip()
|
||||||
|
|
||||||
def _make_slug(text: str) -> str:
|
def _make_slug(text: str) -> str:
|
||||||
slug = re.sub(r'[^a-z0-9]+', '-', text.lower()).strip('-')
|
slug = re.sub(r'[^a-z0-9]+', '-', text.lower()).strip('-')
|
||||||
|
if not slug:
|
||||||
|
slug = "section"
|
||||||
if slug in seen_slugs:
|
if slug in seen_slugs:
|
||||||
seen_slugs[slug] += 1
|
seen_slugs[slug] += 1
|
||||||
slug = f"{slug}-{seen_slugs[slug]}"
|
slug = f"{slug}-{seen_slugs[slug]}"
|
||||||
@@ -79,16 +93,43 @@ def _extract_headings(md_text: str) -> List[Dict[str, str]]:
|
|||||||
|
|
||||||
for m in re.finditer(r'^(#{2,3})\s+(.+)$', md_text, re.MULTILINE):
|
for m in re.finditer(r'^(#{2,3})\s+(.+)$', md_text, re.MULTILINE):
|
||||||
level = len(m.group(1))
|
level = len(m.group(1))
|
||||||
text = m.group(2).strip()
|
text = _plain_heading_text(m.group(2))
|
||||||
|
if not text:
|
||||||
|
continue
|
||||||
headings.append({"level": level, "text": text, "slug": _make_slug(text)})
|
headings.append({"level": level, "text": text, "slug": _make_slug(text)})
|
||||||
if not headings:
|
if not headings:
|
||||||
for m in re.finditer(r'^\*\*([^*]+)\*\*\s*$', md_text, re.MULTILINE):
|
for m in re.finditer(r'^\*\*([^*]+)\*\*\s*$', md_text, re.MULTILINE):
|
||||||
text = m.group(1).strip().rstrip(':')
|
text = _plain_heading_text(m.group(1)).rstrip(':')
|
||||||
if 3 < len(text) < 80:
|
if 3 < len(text) < 80:
|
||||||
headings.append({"level": 2, "text": text, "slug": _make_slug(text)})
|
headings.append({"level": 2, "text": text, "slug": _make_slug(text)})
|
||||||
return headings
|
return headings
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_heading_ids(report_html: str, headings: List[Dict[str, str]]) -> str:
    """Stamp rendered h2/h3 elements with the slugs used by the sidebar links.

    Rendered headings and TOC entries are paired purely by position: the
    n-th ``<h2>``/``<h3>`` in the HTML receives the slug of the n-th entry in
    ``headings``. Level or count disagreements are only logged at debug
    level; pairing stops at the shorter of the two sequences.

    Args:
        report_html: HTML produced from the report Markdown.
        headings: TOC entries; each is read for its ``"level"`` and
            ``"slug"`` keys.

    Returns:
        ``report_html`` unchanged when ``headings`` is empty, otherwise the
        re-serialized document with ``id`` attributes set.
    """
    if not headings:
        return report_html

    document = BeautifulSoup(report_html, "html.parser")
    rendered = document.find_all(["h2", "h3"])

    for tag, entry in zip(rendered, headings):
        wanted = f"h{entry['level']}"
        if tag.name != wanted:
            # Not fatal: the id is still assigned so the link keeps working.
            logger.debug(
                "Visual report heading level mismatch: rendered %s for TOC %s",
                tag.name,
                wanted,
            )
        tag["id"] = entry["slug"]

    rendered_count = len(rendered)
    toc_count = len(headings)
    if rendered_count != toc_count:
        logger.debug(
            "Visual report heading count mismatch: rendered=%s toc=%s",
            rendered_count,
            toc_count,
        )

    return str(document)
|
||||||
|
|
||||||
|
|
||||||
# Overlay buttons shown on each image: reroll (swap for the next unused
|
# Overlay buttons shown on each image: reroll (swap for the next unused
|
||||||
# scraped image) + hide (remove and skip on future renders). Reroll is
|
# scraped image) + hide (remove and skip on future renders). Reroll is
|
||||||
# wired up in the page script using the embedded spare-image pool.
|
# wired up in the page script using the embedded spare-image pool.
|
||||||
@@ -1650,13 +1691,8 @@ def generate_visual_report(
|
|||||||
|
|
||||||
report_html = _md_to_html(report_markdown)
|
report_html = _md_to_html(report_markdown)
|
||||||
|
|
||||||
# Add id anchors to h2/h3 for TOC linking
|
|
||||||
headings = _extract_headings(report_markdown)
|
headings = _extract_headings(report_markdown)
|
||||||
for h in headings:
|
report_html = _apply_heading_ids(report_html, headings)
|
||||||
tag = f"h{h['level']}"
|
|
||||||
pattern = rf'(<{tag}>)(.*?{re.escape(html.escape(h["text"]))}.*?</{tag}>)'
|
|
||||||
replacement = rf'<{tag} id="{h["slug"]}">\2'
|
|
||||||
report_html = re.sub(pattern, replacement, report_html, count=1)
|
|
||||||
|
|
||||||
# Collect all OG images from sources (skip icons, tiny images, known junk)
|
# Collect all OG images from sources (skip icons, tiny images, known junk)
|
||||||
_IMAGE_BLOCKLIST = {
|
_IMAGE_BLOCKLIST = {
|
||||||
|
|||||||
38
tests/test_visual_report.py
Normal file
38
tests/test_visual_report.py
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from src.visual_report import generate_visual_report
|
||||||
|
|
||||||
|
|
||||||
|
def test_visual_report_toc_links_match_rendered_heading_ids():
    """Each TOC sidebar link shows clean heading text and targets an h2/h3."""
    markdown_source = """
# Automated Crypto Trading Bot Strategies

### **1.0 Introduction & Research Scope**

Intro body.

### **2.0 Determining the "Best" Configuration**

Configuration body.
"""

    page = generate_visual_report(
        "crypto bot strategies",
        markdown_source,
        sources=[],
        stats={},
        session_id="rp-test",
    )
    document = BeautifulSoup(page, "html.parser")

    toc_links = document.select(".toc-sidebar nav a")
    labels = [anchor.get_text(strip=True) for anchor in toc_links]
    # Bold markers must be gone from the labels shown in the sidebar.
    assert labels == [
        "1.0 Introduction & Research Scope",
        '2.0 Determining the "Best" Configuration',
    ]

    for anchor in toc_links:
        fragment = anchor["href"].removeprefix("#")
        heading = document.find(id=fragment)
        # The href must resolve to an element, and that element is a heading.
        assert heading is not None
        assert heading.name in {"h2", "h3"}
|
||||||
Reference in New Issue
Block a user