Don't attempt the same (url, model) route twice in the fallback chains (#1733)
The fallback helpers (llm_call_with_fallback, llm_call_async_with_fallback, stream_llm_with_fallback) build their candidate list as the primary target followed by the configured fallbacks. Callers prepend the session's live (url, model) to default_model_fallbacks, so if the user also lists their current model among the fallbacks — a common misconfiguration — the chain re-attempts the very route that just failed: a wasted round-trip (and, for the streaming path, a spurious 'fallback' notice for a switch that didn't actually happen). Add a small _dedupe_candidates() helper that filters malformed entries and drops a later repeat of an already-seen (url, model), preserving order (first wins, keeping its headers). Apply it in all three fallback chains. Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -874,6 +874,31 @@ def llm_call(url: str, model: str, messages: List[Dict], temperature: float = LL
|
|||||||
raise HTTPException(502, f"Unexpected schema from {target_url}: {str(data)[:400]}")
|
raise HTTPException(502, f"Unexpected schema from {target_url}: {str(data)[:400]}")
|
||||||
|
|
||||||
|
|
||||||
|
def _dedupe_candidates(candidates):
|
||||||
|
"""Filter malformed entries and drop a later repeat of an already-seen
|
||||||
|
``(url, model)`` route, preserving order (first occurrence wins).
|
||||||
|
|
||||||
|
The chain is the primary target followed by the configured fallbacks, so a
|
||||||
|
fallback that repeats the session's current model — a common misconfiguration,
|
||||||
|
since callers prepend the live ``(url, model)`` to ``default_model_fallbacks``
|
||||||
|
— would otherwise make the chain re-attempt the very route that just failed:
|
||||||
|
a wasted round-trip plus a spurious ``fallback`` notice for a switch that did
|
||||||
|
not happen. Headers are not part of the key; the first tuple (with its
|
||||||
|
headers) is the one kept.
|
||||||
|
"""
|
||||||
|
seen = set()
|
||||||
|
out = []
|
||||||
|
for c in candidates or []:
|
||||||
|
if not c or not c[0] or not c[1]:
|
||||||
|
continue
|
||||||
|
key = (c[0], c[1])
|
||||||
|
if key in seen:
|
||||||
|
continue
|
||||||
|
seen.add(key)
|
||||||
|
out.append(c)
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
def llm_call_with_fallback(candidates, messages, **kwargs) -> str:
|
def llm_call_with_fallback(candidates, messages, **kwargs) -> str:
|
||||||
"""Sync `llm_call` with an ordered fallback chain.
|
"""Sync `llm_call` with an ordered fallback chain.
|
||||||
|
|
||||||
@@ -882,7 +907,7 @@ def llm_call_with_fallback(candidates, messages, **kwargs) -> str:
|
|||||||
the next candidate. The dead-host cooldown inside `llm_call` makes repeat
|
the next candidate. The dead-host cooldown inside `llm_call` makes repeat
|
||||||
attempts at an offline primary effectively free.
|
attempts at an offline primary effectively free.
|
||||||
"""
|
"""
|
||||||
cands = [c for c in (candidates or []) if c and c[0] and c[1]]
|
cands = _dedupe_candidates(candidates)
|
||||||
if not cands:
|
if not cands:
|
||||||
raise HTTPException(503, "No model endpoint configured")
|
raise HTTPException(503, "No model endpoint configured")
|
||||||
last_err = None
|
last_err = None
|
||||||
@@ -899,7 +924,7 @@ def llm_call_with_fallback(candidates, messages, **kwargs) -> str:
|
|||||||
|
|
||||||
async def llm_call_async_with_fallback(candidates, messages, **kwargs) -> str:
|
async def llm_call_async_with_fallback(candidates, messages, **kwargs) -> str:
|
||||||
"""Async variant of `llm_call_with_fallback` — same semantics."""
|
"""Async variant of `llm_call_with_fallback` — same semantics."""
|
||||||
cands = [c for c in (candidates or []) if c and c[0] and c[1]]
|
cands = _dedupe_candidates(candidates)
|
||||||
if not cands:
|
if not cands:
|
||||||
raise HTTPException(503, "No model endpoint configured")
|
raise HTTPException(503, "No model endpoint configured")
|
||||||
last_err = None
|
last_err = None
|
||||||
@@ -1436,7 +1461,7 @@ async def stream_llm_with_fallback(candidates, messages, **kwargs):
|
|||||||
|
|
||||||
Yields the same SSE chunk protocol as stream_llm.
|
Yields the same SSE chunk protocol as stream_llm.
|
||||||
"""
|
"""
|
||||||
cands = [c for c in (candidates or []) if c and c[0] and c[1]]
|
cands = _dedupe_candidates(candidates)
|
||||||
if not cands:
|
if not cands:
|
||||||
yield f'event: error\ndata: {json.dumps({"error": "No model endpoint configured", "status": 503})}\n\n'
|
yield f'event: error\ndata: {json.dumps({"error": "No model endpoint configured", "status": 503})}\n\n'
|
||||||
return
|
return
|
||||||
|
|||||||
@@ -55,6 +55,44 @@ def test_no_fallback_event_when_primary_succeeds(monkeypatch):
|
|||||||
assert not any('"fallback"' in c for c in chunks)
|
assert not any('"fallback"' in c for c in chunks)
|
||||||
|
|
||||||
|
|
||||||
|
def test_dedupe_candidates_keeps_first_of_each_route():
|
||||||
|
"""(url, model) is the route key; later repeats are dropped, order preserved,
|
||||||
|
the first tuple (with its headers) kept, malformed entries filtered."""
|
||||||
|
cands = [
|
||||||
|
("u1", "m1", {"h": 1}), # first u1/m1 — kept
|
||||||
|
("u1", "m1", {"h": 2}), # repeat route — dropped (first headers win)
|
||||||
|
("u2", "m2", {}), # distinct — kept
|
||||||
|
("u1", "m1", {}), # repeat again — dropped
|
||||||
|
(None, "x", {}), # malformed (no url) — dropped
|
||||||
|
("u3", "", {}), # malformed (no model) — dropped
|
||||||
|
]
|
||||||
|
assert llm_core._dedupe_candidates(cands) == [("u1", "m1", {"h": 1}), ("u2", "m2", {})]
|
||||||
|
assert llm_core._dedupe_candidates([]) == []
|
||||||
|
assert llm_core._dedupe_candidates(None) == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_duplicate_route_is_attempted_only_once(monkeypatch):
|
||||||
|
"""A fallback that repeats the primary's (url, model) must NOT make the chain
|
||||||
|
sail back into the same dead route — each distinct route is tried once."""
|
||||||
|
calls = []
|
||||||
|
|
||||||
|
async def fake_stream(url, model, messages, **kw):
|
||||||
|
calls.append((url, model))
|
||||||
|
yield 'event: error\ndata: {"status": 503, "text": "down"}\n\n'
|
||||||
|
|
||||||
|
monkeypatch.setattr(llm_core, "stream_llm", fake_stream)
|
||||||
|
|
||||||
|
async def run():
|
||||||
|
out = []
|
||||||
|
cands = [("u1", "m1", {}), ("u1", "m1", {}), ("u2", "m2", {})]
|
||||||
|
async for c in llm_core.stream_llm_with_fallback(cands, [{"role": "user", "content": "hi"}]):
|
||||||
|
out.append(c)
|
||||||
|
return out
|
||||||
|
|
||||||
|
asyncio.run(run())
|
||||||
|
assert calls == [("u1", "m1"), ("u2", "m2")], f"duplicate route re-attempted: {calls}"
|
||||||
|
|
||||||
|
|
||||||
def test_summarize_stream_error():
|
def test_summarize_stream_error():
|
||||||
assert "400" in llm_core._summarize_stream_error('event: error\ndata: {"status": 400, "text": "nope"}\n\n')
|
assert "400" in llm_core._summarize_stream_error('event: error\ndata: {"status": 400, "text": "nope"}\n\n')
|
||||||
assert llm_core._summarize_stream_error(None) == "primary model failed"
|
assert llm_core._summarize_stream_error(None) == "primary model failed"
|
||||||
|
|||||||
Reference in New Issue
Block a user