diff --git a/src/llm_core.py b/src/llm_core.py index 9ee4ca6..b255e5e 100644 --- a/src/llm_core.py +++ b/src/llm_core.py @@ -874,6 +874,31 @@ def llm_call(url: str, model: str, messages: List[Dict], temperature: float = LL raise HTTPException(502, f"Unexpected schema from {target_url}: {str(data)[:400]}") +def _dedupe_candidates(candidates): + """Filter malformed entries and drop a later repeat of an already-seen + ``(url, model)`` route, preserving order (first occurrence wins). + + The chain is the primary target followed by the configured fallbacks, so a + fallback that repeats the session's current model — a common misconfiguration, + since callers prepend the live ``(url, model)`` to ``default_model_fallbacks`` + — would otherwise make the chain re-attempt the very route that just failed: + a wasted round-trip plus a spurious ``fallback`` notice for a switch that did + not happen. Headers are not part of the key; the first tuple (with its + headers) is the one kept. + """ + seen = set() + out = [] + for c in candidates or []: + if not c or not c[0] or not c[1]: + continue + key = (c[0], c[1]) + if key in seen: + continue + seen.add(key) + out.append(c) + return out + + def llm_call_with_fallback(candidates, messages, **kwargs) -> str: """Sync `llm_call` with an ordered fallback chain. @@ -882,7 +907,7 @@ def llm_call_with_fallback(candidates, messages, **kwargs) -> str: the next candidate. The dead-host cooldown inside `llm_call` makes repeat attempts at an offline primary effectively free. """ - cands = [c for c in (candidates or []) if c and c[0] and c[1]] + cands = _dedupe_candidates(candidates) if not cands: raise HTTPException(503, "No model endpoint configured") last_err = None @@ -899,7 +924,7 @@ def llm_call_with_fallback(candidates, messages, **kwargs) -> str: async def llm_call_async_with_fallback(candidates, messages, **kwargs) -> str: """Async variant of `llm_call_with_fallback` — same semantics.""" - cands = [c for c in (candidates or []) if c and c[0] and c[1]] + cands = _dedupe_candidates(candidates) if not cands: raise HTTPException(503, "No model endpoint configured") last_err = None @@ -1436,7 +1461,7 @@ async def stream_llm_with_fallback(candidates, messages, **kwargs): Yields the same SSE chunk protocol as stream_llm. """ - cands = [c for c in (candidates or []) if c and c[0] and c[1]] + cands = _dedupe_candidates(candidates) if not cands: yield f'event: error\ndata: {json.dumps({"error": "No model endpoint configured", "status": 503})}\n\n' return diff --git a/tests/test_llm_core_fallback.py b/tests/test_llm_core_fallback.py index 9f30154..f1c4e6f 100644 --- a/tests/test_llm_core_fallback.py +++ b/tests/test_llm_core_fallback.py @@ -55,6 +55,44 @@ def test_no_fallback_event_when_primary_succeeds(monkeypatch): assert not any('"fallback"' in c for c in chunks) +def test_dedupe_candidates_keeps_first_of_each_route(): + """(url, model) is the route key; later repeats are dropped, order preserved, + the first tuple (with its headers) kept, malformed entries filtered.""" + cands = [ + ("u1", "m1", {"h": 1}), # first u1/m1 — kept + ("u1", "m1", {"h": 2}), # repeat route — dropped (first headers win) + ("u2", "m2", {}), # distinct — kept + ("u1", "m1", {}), # repeat again — dropped + (None, "x", {}), # malformed (no url) — dropped + ("u3", "", {}), # malformed (no model) — dropped + ] + assert llm_core._dedupe_candidates(cands) == [("u1", "m1", {"h": 1}), ("u2", "m2", {})] + assert llm_core._dedupe_candidates([]) == [] + assert llm_core._dedupe_candidates(None) == [] + + +def test_duplicate_route_is_attempted_only_once(monkeypatch): + """A fallback that repeats the primary's (url, model) must NOT make the chain + sail back into the same dead route — each distinct route is tried once.""" + calls = [] + + async def fake_stream(url, model, messages, **kw): + calls.append((url, model)) + yield 'event: error\ndata: {"status": 503, "text": "down"}\n\n' + + monkeypatch.setattr(llm_core, "stream_llm", fake_stream) + + async def run(): + out = [] + cands = [("u1", "m1", {}), ("u1", "m1", {}), ("u2", "m2", {})] + async for c in llm_core.stream_llm_with_fallback(cands, [{"role": "user", "content": "hi"}]): + out.append(c) + return out + + asyncio.run(run()) + assert calls == [("u1", "m1"), ("u2", "m2")], f"duplicate route re-attempted: {calls}" + + def test_summarize_stream_error(): assert "400" in llm_core._summarize_stream_error('event: error\ndata: {"status": 400, "text": "nope"}\n\n') assert llm_core._summarize_stream_error(None) == "primary model failed"