From 126e91e8b9c223ff21cd9d787787d3c1f68514ae Mon Sep 17 00:00:00 2001
From: lekt8 <lewistham9x@gmail.com>
Date: Wed, 3 Jun 2026 12:33:50 +0800
Subject: [PATCH] Don't attempt the same (url, model) route twice in the
 fallback chains (#1733)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The fallback helpers (llm_call_with_fallback, llm_call_async_with_fallback,
stream_llm_with_fallback) build their candidate list as the primary target
followed by the configured fallbacks. Callers prepend the session's live
(url, model) to default_model_fallbacks, so if the user also lists their current
model among the fallbacks — a common misconfiguration — the chain re-attempts
the very route that just failed: a wasted round-trip (and, for the streaming
path, a spurious 'fallback' notice for a switch that didn't actually happen).

Add a small _dedupe_candidates() helper that filters malformed entries and drops
a later repeat of an already-seen (url, model), preserving order (first wins,
keeping its headers). Apply it in all three fallback chains.

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/llm_core.py                 | 31 ++++++++++++++++++++++++---
 tests/test_llm_core_fallback.py | 38 +++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 3 deletions(-)

diff --git a/src/llm_core.py b/src/llm_core.py
index 9ee4ca6..b255e5e 100644
--- a/src/llm_core.py
+++ b/src/llm_core.py
@@ -874,6 +874,31 @@ def llm_call(url: str, model: str, messages: List[Dict], temperature: float = LL
         raise HTTPException(502, f"Unexpected schema from {target_url}: {str(data)[:400]}")
 
 
+def _dedupe_candidates(candidates):
+    """Filter malformed entries and drop a later repeat of an already-seen
+    ``(url, model)`` route, preserving order (first occurrence wins).
+
+    The chain is the primary target followed by the configured fallbacks, so a
+    fallback that repeats the session's current model — a common misconfiguration,
+    since callers prepend the live ``(url, model)`` to ``default_model_fallbacks``
+    — would otherwise make the chain re-attempt the very route that just failed:
+    a wasted round-trip plus a spurious ``fallback`` notice for a switch that did
+    not happen. Headers are not part of the key; the first tuple (with its
+    headers) is the one kept.
+    """
+    seen = set()
+    out = []
+    for c in candidates or []:
+        if not c or not c[0] or not c[1]:
+            continue
+        key = (c[0], c[1])
+        if key in seen:
+            continue
+        seen.add(key)
+        out.append(c)
+    return out
+
+
 def llm_call_with_fallback(candidates, messages, **kwargs) -> str:
     """Sync `llm_call` with an ordered fallback chain.
 
@@ -882,7 +907,7 @@ def llm_call_with_fallback(candidates, messages, **kwargs) -> str:
     the next candidate. The dead-host cooldown inside `llm_call` makes repeat
     attempts at an offline primary effectively free.
     """
-    cands = [c for c in (candidates or []) if c and c[0] and c[1]]
+    cands = _dedupe_candidates(candidates)
     if not cands:
         raise HTTPException(503, "No model endpoint configured")
     last_err = None
@@ -899,7 +924,7 @@ def llm_call_with_fallback(candidates, messages, **kwargs) -> str:
 
 async def llm_call_async_with_fallback(candidates, messages, **kwargs) -> str:
     """Async variant of `llm_call_with_fallback` — same semantics."""
-    cands = [c for c in (candidates or []) if c and c[0] and c[1]]
+    cands = _dedupe_candidates(candidates)
     if not cands:
         raise HTTPException(503, "No model endpoint configured")
     last_err = None
@@ -1436,7 +1461,7 @@ async def stream_llm_with_fallback(candidates, messages, **kwargs):
 
     Yields the same SSE chunk protocol as stream_llm.
     """
-    cands = [c for c in (candidates or []) if c and c[0] and c[1]]
+    cands = _dedupe_candidates(candidates)
     if not cands:
         yield f'event: error\ndata: {json.dumps({"error": "No model endpoint configured", "status": 503})}\n\n'
         return
diff --git a/tests/test_llm_core_fallback.py b/tests/test_llm_core_fallback.py
index 9f30154..f1c4e6f 100644
--- a/tests/test_llm_core_fallback.py
+++ b/tests/test_llm_core_fallback.py
@@ -55,6 +55,44 @@ def test_no_fallback_event_when_primary_succeeds(monkeypatch):
     assert not any('"fallback"' in c for c in chunks)
 
 
+def test_dedupe_candidates_keeps_first_of_each_route():
+    """(url, model) is the route key; later repeats are dropped, order preserved,
+    the first tuple (with its headers) kept, malformed entries filtered."""
+    cands = [
+        ("u1", "m1", {"h": 1}),   # first u1/m1 — kept
+        ("u1", "m1", {"h": 2}),   # repeat route — dropped (first headers win)
+        ("u2", "m2", {}),         # distinct — kept
+        ("u1", "m1", {}),         # repeat again — dropped
+        (None, "x", {}),          # malformed (no url) — dropped
+        ("u3", "", {}),           # malformed (no model) — dropped
+    ]
+    assert llm_core._dedupe_candidates(cands) == [("u1", "m1", {"h": 1}), ("u2", "m2", {})]
+    assert llm_core._dedupe_candidates([]) == []
+    assert llm_core._dedupe_candidates(None) == []
+
+
+def test_duplicate_route_is_attempted_only_once(monkeypatch):
+    """A fallback that repeats the primary's (url, model) must NOT make the chain
+    sail back into the same dead route — each distinct route is tried once."""
+    calls = []
+
+    async def fake_stream(url, model, messages, **kw):
+        calls.append((url, model))
+        yield 'event: error\ndata: {"status": 503, "text": "down"}\n\n'
+
+    monkeypatch.setattr(llm_core, "stream_llm", fake_stream)
+
+    async def run():
+        out = []
+        cands = [("u1", "m1", {}), ("u1", "m1", {}), ("u2", "m2", {})]
+        async for c in llm_core.stream_llm_with_fallback(cands, [{"role": "user", "content": "hi"}]):
+            out.append(c)
+        return out
+
+    asyncio.run(run())
+    assert calls == [("u1", "m1"), ("u2", "m2")], f"duplicate route re-attempted: {calls}"
+
+
 def test_summarize_stream_error():
     assert "400" in llm_core._summarize_stream_error('event: error\ndata: {"status": 400, "text": "nope"}\n\n')
     assert llm_core._summarize_stream_error(None) == "primary model failed"