* chore: dedupe src/search/cache.py into a re-export shim src/search/cache.py was a byte-identical copy of services/search/cache.py. Convert it to a sys.modules alias of the canonical services module (matching src/search/core.py, providers.py, ranking.py) so the two cannot drift, and add an identity assertion to test_search_module_consolidation.py. content.py and query.py are intentionally left as-is: the copies have drifted and services lacks fixes that src has, so they need services reconciled first before they can be shimmed safely. * chore: dedupe src/search content.py and query.py into shims Convert src/search/content.py and query.py to sys.modules aliases of the canonical services/search/* (matching cache.py, core.py, providers.py, ranking.py) so the duplicate copies cannot drift. Repoint the two tests that were coupled to the src-copy internals onto the canonical services surface (behaviour is equivalent): - test_src_search_query_nonstring.py: import services.search.query instead of loading the src file by path. - test_security_regressions.py::test_web_fetch_guard_blocks_redirect_into_private: mock httpx.get (services uses the module-level get, not httpx.Client) and assert on the canonical 'Blocked' message. Drop the now-redundant [src_content, service_content] parametrization in test_search_content_extraction_parity.py and test_search_content_url_guards.py (after the shim both params are the same object); add content/query identity assertions to test_search_module_consolidation.py.
52 lines
1.6 KiB
Python
52 lines
1.6 KiB
Python
"""Content extraction behavior for the canonical services.search.content module."""
|
|
|
|
import pytest
|
|
|
|
pytest.importorskip("bs4")
|
|
|
|
from services.search import content as service_content
|
|
|
|
|
|
class _FakeResponse:
|
|
status_code = 200
|
|
headers = {"Content-Type": "text/html; charset=utf-8"}
|
|
content = b""
|
|
|
|
def __init__(self, text: str):
|
|
self.text = text
|
|
|
|
def raise_for_status(self):
|
|
return None
|
|
|
|
|
|
@pytest.mark.parametrize("module", [service_content])
|
|
def test_content_fetcher_extracts_og_image_and_body_fallback(module, tmp_path, monkeypatch):
|
|
html = """
|
|
<html>
|
|
<head>
|
|
<title>Example</title>
|
|
<meta property="og:image" content="https://example.com/cover.jpg">
|
|
</head>
|
|
<body>
|
|
<nav>Navigation text should not win</nav>
|
|
<div class="content">Tiny</div>
|
|
<main>
|
|
<p>This is the substantive body text that should be retained.</p>
|
|
<p>It is much longer than the tiny class-matched wrapper.</p>
|
|
</main>
|
|
<script>window.secret = "not content";</script>
|
|
</body>
|
|
</html>
|
|
"""
|
|
|
|
monkeypatch.setattr(module, "CONTENT_CACHE_DIR", tmp_path)
|
|
module.content_cache_index.clear()
|
|
monkeypatch.setattr(module, "_get_public_url", lambda url, headers, timeout: _FakeResponse(html))
|
|
|
|
result = module.fetch_webpage_content("https://example.com/parity-test")
|
|
|
|
assert result["og_image"] == "https://example.com/cover.jpg"
|
|
assert "substantive body text" in result["content"]
|
|
assert "much longer than the tiny" in result["content"]
|
|
assert "window.secret" not in result["content"]
|