fix: web search content blocks numbered by fetch completion order break citations (#1672)
This commit is contained in:
@@ -356,6 +356,12 @@ def comprehensive_web_search(
|
|||||||
for r in search_results if r.get("url")
|
for r in search_results if r.get("url")
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Map each URL to its [i] number in the sources list so fetched content
|
||||||
|
# blocks can be labeled with the SAME index the model cites.
|
||||||
|
_url_index = {
|
||||||
|
r["url"]: i for i, r in enumerate(search_results, 1) if r.get("url")
|
||||||
|
}
|
||||||
|
|
||||||
# Fetch content in parallel
|
# Fetch content in parallel
|
||||||
fetched_content = []
|
fetched_content = []
|
||||||
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||||
@@ -368,6 +374,10 @@ def comprehensive_web_search(
|
|||||||
try:
|
try:
|
||||||
result = future.result()
|
result = future.result()
|
||||||
if result["success"] and result["content"] and len(result["content"]) >= min_content_length:
|
if result["success"] and result["content"] and len(result["content"]) >= min_content_length:
|
||||||
|
# Remember which source this fetch belongs to: redirects
|
||||||
|
# can change result["url"] and completion order is
|
||||||
|
# arbitrary, so the block label cannot be recomputed later.
|
||||||
|
result["source_index"] = _url_index.get(url)
|
||||||
fetched_content.append(result)
|
fetched_content.append(result)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Exception while fetching {url}: {str(e)}")
|
logger.error(f"Exception while fetching {url}: {str(e)}")
|
||||||
@@ -408,8 +418,15 @@ def comprehensive_web_search(
|
|||||||
output_parts.append("FETCHED PAGE CONTENT:")
|
output_parts.append("FETCHED PAGE CONTENT:")
|
||||||
output_parts.append("-" * 50)
|
output_parts.append("-" * 50)
|
||||||
|
|
||||||
for i, content in enumerate(fetched_content, 1):
|
# Emit blocks in source order, numbered with the same [i] as the
|
||||||
output_parts.append(f"\n[CONTENT {i}] From: {content['url']}")
|
# sources list, so [CONTENT 2] really is content from source [2].
|
||||||
|
# Before this, blocks were numbered 1..N in fetch COMPLETION order,
|
||||||
|
# which matched neither the sources list nor each other run to run.
|
||||||
|
fetched_content.sort(key=lambda c: c.get("source_index") or len(search_results) + 1)
|
||||||
|
for content in fetched_content:
|
||||||
|
_idx = content.get("source_index")
|
||||||
|
_label = f"[CONTENT {_idx}]" if _idx else "[CONTENT]"
|
||||||
|
output_parts.append(f"\n{_label} From: {content['url']}")
|
||||||
output_parts.append(f"Title: {content['title']}")
|
output_parts.append(f"Title: {content['title']}")
|
||||||
output_parts.append("-" * 30)
|
output_parts.append("-" * 30)
|
||||||
|
|
||||||
|
|||||||
61
tests/test_search_content_block_source_index.py
Normal file
61
tests/test_search_content_block_source_index.py
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
"""[CONTENT i] blocks must map to the [i] sources list.
|
||||||
|
|
||||||
|
comprehensive_web_search numbers its sources list by search-result order,
|
||||||
|
but the fetched-content blocks were numbered 1..N in fetch COMPLETION
|
||||||
|
order (as_completed). With parallel fetching the two numberings disagree,
|
||||||
|
so the model cites "[2]" for content that actually came from source [3].
|
||||||
|
"""
|
||||||
|
|
||||||
|
import importlib
|
||||||
|
import time
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def core(monkeypatch):
|
||||||
|
mod = importlib.import_module("services.search.core")
|
||||||
|
results = [
|
||||||
|
{"url": "http://one.example/a", "title": "One", "snippet": "s1"},
|
||||||
|
{"url": "http://two.example/b", "title": "Two", "snippet": "s2"},
|
||||||
|
]
|
||||||
|
monkeypatch.setattr(mod, "_get_search_settings", lambda: {"search_provider": "searxng"})
|
||||||
|
monkeypatch.setattr(mod, "_get_result_count", lambda: 2)
|
||||||
|
monkeypatch.setattr(mod, "_call_provider", lambda *a, **k: [dict(r) for r in results])
|
||||||
|
monkeypatch.setattr(mod, "rank_search_results", lambda q, r: r)
|
||||||
|
return mod
|
||||||
|
|
||||||
|
|
||||||
|
def _fake_fetch_delaying_first(url, timeout=8, retry_attempt=0):
|
||||||
|
if "one.example" in url:
|
||||||
|
# Force the FIRST source to finish fetching LAST
|
||||||
|
time.sleep(0.4)
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"url": url,
|
||||||
|
"title": "Title for " + url,
|
||||||
|
"content": "Content for " + url + " " + "filler " * 20,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_content_blocks_numbered_by_source_not_completion_order(core, monkeypatch):
|
||||||
|
monkeypatch.setattr(core, "fetch_webpage_content", _fake_fetch_delaying_first)
|
||||||
|
out = core.comprehensive_web_search("test query", max_pages=2, max_workers=2)
|
||||||
|
assert "[CONTENT 1] From: http://one.example/a" in out
|
||||||
|
assert "[CONTENT 2] From: http://two.example/b" in out
|
||||||
|
assert out.index("[CONTENT 1]") < out.index("[CONTENT 2]")
|
||||||
|
|
||||||
|
|
||||||
|
def test_redirected_fetch_keeps_its_source_index(core, monkeypatch):
|
||||||
|
def fetch(url, timeout=8, retry_attempt=0):
|
||||||
|
final = "http://final.example/landing" if "two.example" in url else url
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"url": final,
|
||||||
|
"title": "Title",
|
||||||
|
"content": "Content for " + final + " " + "filler " * 20,
|
||||||
|
}
|
||||||
|
|
||||||
|
monkeypatch.setattr(core, "fetch_webpage_content", fetch)
|
||||||
|
out = core.comprehensive_web_search("test query", max_pages=2, max_workers=2)
|
||||||
|
assert "[CONTENT 2] From: http://final.example/landing" in out
|
||||||
Reference in New Issue
Block a user