From eae8797e0882efa2bd06cbc3160a192ec7bd1a4a Mon Sep 17 00:00:00 2001 From: Afonso Coutinho Date: Wed, 3 Jun 2026 06:22:55 +0100 Subject: [PATCH] fix: web search content blocks numbered by fetch completion order break citations (#1672) --- services/search/core.py | 21 ++++++- .../test_search_content_block_source_index.py | 61 +++++++++++++++++++ 2 files changed, 80 insertions(+), 2 deletions(-) create mode 100644 tests/test_search_content_block_source_index.py diff --git a/services/search/core.py b/services/search/core.py index 1230946..992022b 100644 --- a/services/search/core.py +++ b/services/search/core.py @@ -356,6 +356,12 @@ def comprehensive_web_search( for r in search_results if r.get("url") ] + # Map each URL to its [i] number in the sources list so fetched content + # blocks can be labeled with the SAME index the model cites. + _url_index = { + r["url"]: i for i, r in enumerate(search_results, 1) if r.get("url") + } + # Fetch content in parallel fetched_content = [] with ThreadPoolExecutor(max_workers=max_workers) as executor: @@ -368,6 +374,10 @@ def comprehensive_web_search( try: result = future.result() if result["success"] and result["content"] and len(result["content"]) >= min_content_length: + # Remember which source this fetch belongs to: redirects + # can change result["url"] and completion order is + # arbitrary, so the block label cannot be recomputed later. + result["source_index"] = _url_index.get(url) fetched_content.append(result) except Exception as e: logger.error(f"Exception while fetching {url}: {str(e)}") @@ -408,8 +418,15 @@ def comprehensive_web_search( output_parts.append("FETCHED PAGE CONTENT:") output_parts.append("-" * 50) - for i, content in enumerate(fetched_content, 1): - output_parts.append(f"\n[CONTENT {i}] From: {content['url']}") + # Emit blocks in source order, numbered with the same [i] as the + # sources list, so [CONTENT 2] really is content from source [2]. + # Before this, blocks were numbered 1..N in fetch COMPLETION order, + # which matched neither the sources list nor each other run to run. + fetched_content.sort(key=lambda c: c.get("source_index") or len(search_results) + 1) + for content in fetched_content: + _idx = content.get("source_index") + _label = f"[CONTENT {_idx}]" if _idx else "[CONTENT]" + output_parts.append(f"\n{_label} From: {content['url']}") output_parts.append(f"Title: {content['title']}") output_parts.append("-" * 30) diff --git a/tests/test_search_content_block_source_index.py b/tests/test_search_content_block_source_index.py new file mode 100644 index 0000000..8b28f9b --- /dev/null +++ b/tests/test_search_content_block_source_index.py @@ -0,0 +1,61 @@ +"""[CONTENT i] blocks must map to the [i] sources list. + +comprehensive_web_search numbers its sources list by search-result order, +but the fetched-content blocks were numbered 1..N in fetch COMPLETION +order (as_completed). With parallel fetching the two numberings disagree, +so the model cites "[2]" for content that actually came from source [3]. +""" + +import importlib +import time + +import pytest + + +@pytest.fixture +def core(monkeypatch): + mod = importlib.import_module("services.search.core") + results = [ + {"url": "http://one.example/a", "title": "One", "snippet": "s1"}, + {"url": "http://two.example/b", "title": "Two", "snippet": "s2"}, + ] + monkeypatch.setattr(mod, "_get_search_settings", lambda: {"search_provider": "searxng"}) + monkeypatch.setattr(mod, "_get_result_count", lambda: 2) + monkeypatch.setattr(mod, "_call_provider", lambda *a, **k: [dict(r) for r in results]) + monkeypatch.setattr(mod, "rank_search_results", lambda q, r: r) + return mod + + +def _fake_fetch_delaying_first(url, timeout=8, retry_attempt=0): + if "one.example" in url: + # Force the FIRST source to finish fetching LAST + time.sleep(0.4) + return { + "success": True, + "url": url, + "title": "Title for " + url, + "content": "Content for " + url + " " + "filler " * 20, + } + + +def test_content_blocks_numbered_by_source_not_completion_order(core, monkeypatch): + monkeypatch.setattr(core, "fetch_webpage_content", _fake_fetch_delaying_first) + out = core.comprehensive_web_search("test query", max_pages=2, max_workers=2) + assert "[CONTENT 1] From: http://one.example/a" in out + assert "[CONTENT 2] From: http://two.example/b" in out + assert out.index("[CONTENT 1]") < out.index("[CONTENT 2]") + + +def test_redirected_fetch_keeps_its_source_index(core, monkeypatch): + def fetch(url, timeout=8, retry_attempt=0): + final = "http://final.example/landing" if "two.example" in url else url + return { + "success": True, + "url": final, + "title": "Title", + "content": "Content for " + final + " " + "filler " * 20, + } + + monkeypatch.setattr(core, "fetch_webpage_content", fetch) + out = core.comprehensive_web_search("test query", max_pages=2, max_workers=2) + assert "[CONTENT 2] From: http://final.example/landing" in out