"""Content extraction behavior for the canonical services.search.content module.""" import pytest pytest.importorskip("bs4") from services.search import content as service_content class _FakeResponse: status_code = 200 headers = {"Content-Type": "text/html; charset=utf-8"} content = b"" def __init__(self, text: str): self.text = text def raise_for_status(self): return None @pytest.mark.parametrize("module", [service_content]) def test_content_fetcher_extracts_og_image_and_body_fallback(module, tmp_path, monkeypatch): html = """ Example
Tiny

This is the substantive body text that should be retained.

It is much longer than the tiny class-matched wrapper.

""" monkeypatch.setattr(module, "CONTENT_CACHE_DIR", tmp_path) module.content_cache_index.clear() monkeypatch.setattr(module, "_get_public_url", lambda url, headers, timeout: _FakeResponse(html)) result = module.fetch_webpage_content("https://example.com/parity-test") assert result["og_image"] == "https://example.com/cover.jpg" assert "substantive body text" in result["content"] assert "much longer than the tiny" in result["content"] assert "window.secret" not in result["content"]