Search: align service content extraction

Co-authored-by: ghreprimand <203024559+ghreprimand@users.noreply.github.com>
2026-06-02 06:53:07 -05:00
parent c85da91964
commit aa0a9e8b5a
2 changed files with 94 additions and 5 deletions
--- a/services/search/content.py
+++ b/services/search/content.py
@@ -1,5 +1,6 @@
 """Webpage content fetching with caching, PDF extraction, and summarization helpers."""

+import copy
 import io
 import ipaddress
 import json
@@ -115,6 +116,28 @@ def _extract_meta(soup: BeautifulSoup) -> dict:
    return {"description": description, "keywords": keywords}


+def _extract_og_image(soup: BeautifulSoup) -> str:
+    """Extract the best representative image URL from meta tags.
+
+    Only returns absolute http(s) URLs -- skips relative paths and data URIs.
+    """
+    candidates = []  # stripped, non-empty content values in priority order
+    for prop in ("og:image", "og:image:url", "og:image:secure_url"):  # matched on the property= attribute
+        tag = soup.find("meta", attrs={"property": prop})
+        if tag and tag.get("content", "").strip():
+            candidates.append(tag["content"].strip())
+    tag = soup.find("meta", attrs={"name": "twitter:image"})  # matched on name=, not property=
+    if tag and tag.get("content", "").strip():
+        candidates.append(tag["content"].strip())
+    tag = soup.find("meta", attrs={"name": "thumbnail"})  # last candidate considered
+    if tag and tag.get("content", "").strip():
+        candidates.append(tag["content"].strip())
+    for url in candidates:  # the first candidate passing the filter is returned
+        if url.startswith(("https://", "http://")) and not url.endswith((".svg", ".ico")):
+            return url  # NOTE(review): both checks are case-sensitive, and ".svg?v=1" is not filtered
+    return ""  # no candidate qualified
+
+
 def _extract_lists(soup: BeautifulSoup) -> List[List[str]]:
    """Return a list of lists, each inner list representing a <ul>/<ol>."""
    all_lists = []
@@ -275,10 +298,12 @@ def fetch_webpage_content(url: str, timeout: int = 5, retry_attempt: int = 0) ->
    title_tag = soup.find("title")
    title_text = title_tag.get_text(strip=True) if title_tag else ""
    meta_info = _extract_meta(soup)
+    og_image = _extract_og_image(soup)
    js_rendered = _detect_js_frameworks(soup)
    js_message = "Page appears to be rendered by a JavaScript framework; content may be incomplete." if js_rendered else ""

-    # Main textual content (heuristic)
+    # Main textual content (heuristic): prefer semantic / "content"-classed
+    # containers to skip nav/footer/boilerplate; tuned for article pages.
    main_content = ""
    content_areas = soup.find_all(
        ["main", "article", "section", "div"],
@@ -287,12 +312,23 @@ def fetch_webpage_content(url: str, timeout: int = 5, retry_attempt: int = 0) ->
    if content_areas:
        for area in content_areas[:3]:
            main_content += area.get_text(separator=" ", strip=True) + " "
-    if not main_content:
+    main_content = re.sub(r"\s+", " ", main_content).strip()
+
+    # If the heuristic finds only a tiny wrapper, fall back to body text with
+    # obvious boilerplate stripped so UI/deep-research search results do not
+    # look empty for app/landing pages.
+    THIN_CONTENT_CHARS = 600
+    if len(main_content) < THIN_CONTENT_CHARS:
        body = soup.find("body")
        if body:
-            main_content = body.get_text(separator=" ", strip=True)
-
-    main_content = re.sub(r"\s+", " ", main_content).strip()[:8000]
+            body_copy = copy.copy(body)
+            for noise in body_copy.find_all(
+                ["script", "style", "noscript", "template", "nav", "header", "footer", "aside"]
+            ):
+                noise.extract()
+            body_text = re.sub(r"\s+", " ", body_copy.get_text(separator=" ", strip=True)).strip()
+            if len(body_text) > len(main_content):
+                main_content = body_text

    result = {
        "url": url,
@@ -303,6 +339,7 @@ def fetch_webpage_content(url: str, timeout: int = 5, retry_attempt: int = 0) ->
        "code_blocks": _extract_code_blocks(soup),
        "meta_description": meta_info.get("description", ""),
        "meta_keywords": meta_info.get("keywords", ""),
+        "og_image": og_image,
        "js_rendered": js_rendered,
        "js_message": js_message,
        "success": True,
--- a/tests/test_search_content_extraction_parity.py
+++ b/tests/test_search_content_extraction_parity.py
@@ -0,0 +1,52 @@
+"""Keep src.search and services.search content extraction behavior aligned."""
+
+import pytest
+
+pytest.importorskip("bs4")  # skip this module when bs4 is absent; precedes the content imports, which presumably import bs4 -- confirm
+
+from services.search import content as service_content
+from src.search import content as src_content
+
+
+class _FakeResponse:  # stand-in object returned by the patched _get_public_url in the test below
+    status_code = 200
+    headers = {"Content-Type": "text/html; charset=utf-8"}  # declares an HTML payload
+    content = b""  # empty bytes body; the test HTML is stored on .text instead
+
+    def __init__(self, text: str):
+        self.text = text
+
+    def raise_for_status(self):
+        return None  # never raises
+
+
+@pytest.mark.parametrize("module", [src_content, service_content])  # same assertions against both modules
+def test_content_fetcher_extracts_og_image_and_body_fallback(module, tmp_path, monkeypatch):
+    html = """
+    <html>
+      <head>
+        <title>Example</title>
+        <meta property="og:image" content="https://example.com/cover.jpg">
+      </head>
+      <body>
+        <nav>Navigation text should not win</nav>
+        <div class="content">Tiny</div>
+        <main>
+          <p>This is the substantive body text that should be retained.</p>
+          <p>It is much longer than the tiny class-matched wrapper.</p>
+        </main>
+        <script>window.secret = "not content";</script>
+      </body>
+    </html>
+    """
+
+    monkeypatch.setattr(module, "CONTENT_CACHE_DIR", tmp_path)  # point the module's cache dir at a per-test temp path
+    module.content_cache_index.clear()  # empty the module-level cache index before fetching
+    monkeypatch.setattr(module, "_get_public_url", lambda url, headers, timeout: _FakeResponse(html))  # stub: always returns the canned HTML
+
+    result = module.fetch_webpage_content("https://example.com/parity-test")
+
+    assert result["og_image"] == "https://example.com/cover.jpg"  # value of the og:image meta tag
+    assert "substantive body text" in result["content"]  # first <p> inside <main>
+    assert "much longer than the tiny" in result["content"]  # second <p> inside <main>
+    assert "window.secret" not in result["content"]  # <script> text must not appear in the content