From 6776c7d69105b78bc1865100f9f3d0e4c90acb35 Mon Sep 17 00:00:00 2001
From: James Arslan <james.arslan@tagd.ai>
Date: Tue, 2 Jun 2026 04:37:25 +0200
Subject: [PATCH] Surface silent model fallback instead of masking it (#868)

When the selected model fails before producing output, stream_llm_with_fallback
quietly switches to the next candidate and the reply is shown under the
originally selected model's name, so a misconfigured provider looks like it
works. (Concretely: a Bedrock gateway that 400s every Anthropic/Claude request
appears fine because another model silently answers under the Claude label.)

Emit a `fallback` SSE event ({selected_model, answered_by, reason}) the first
time a non-primary candidate produces output, forward it through the agent loop
and both chat-route paths, stamp the response metrics with the model that
actually answered, and show a notice + relabel the reply in the UI.

Tested: python -m pytest tests/test_llm_core_fallback.py (3 pass);
python -m py_compile src/llm_core.py src/agent_loop.py routes/chat_routes.py;
node --check static/js/chat.js.
---
 routes/chat_routes.py           | 18 ++++++++--
 src/agent_loop.py               |  6 ++++
 src/llm_core.py                 | 32 +++++++++++++++++
 static/js/chat.js               | 20 +++++++++++
 tests/test_llm_core_fallback.py | 61 +++++++++++++++++++++++++++++++++
 5 files changed, 135 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_llm_core_fallback.py

diff --git a/routes/chat_routes.py b/routes/chat_routes.py
index b2e9dd5..84143d0 100644
--- a/routes/chat_routes.py
+++ b/routes/chat_routes.py
@@ -769,6 +769,7 @@ def setup_chat_routes(
                 return
             elif chat_mode == "chat":
                 _chat_start = time.time()
+                _answered_by = None  # set if the selected model failed and a fallback answered
                 # ── Chat mode: call stream_llm directly, NO tools, NO document access ──
                 try:
                     _chat_candidates = [(sess.endpoint_url, sess.model, sess.headers)] + _fallback_candidates
@@ -797,9 +798,14 @@ def setup_chat_routes(
                                         full_response += data["delta"]
                                         _stream_set(session, partial=full_response)
                                     yield chunk
+                                elif data.get("type") == "fallback":
+                                    # Selected model failed; a fallback answered.
+                                    # Forward the notice and remember the real model.
+                                    _answered_by = data.get("answered_by") or _answered_by
+                                    yield chunk
                                 elif data.get("type") == "usage":
                                     last_metrics = data.get("data", {})
-                                    last_metrics["model"] = sess.model
+                                    last_metrics["model"] = _answered_by or sess.model
                                     if ctx.context_length and last_metrics.get("input_tokens"):
                                         pct = min(round((last_metrics["input_tokens"] / ctx.context_length) * 100, 1), 100.0)
                                         last_metrics["context_percent"] = pct
@@ -867,6 +873,7 @@ def setup_chat_routes(
                 # ── Agent mode: full agent loop with tools ──
                 _agent_rounds = 0
                 _agent_tool_calls = 0
+                _answered_by = None  # set if the selected model failed and a fallback answered
                 try:
                     from src.settings import get_setting
                     _tool_budget = int(get_setting("agent_max_tool_calls", 0))
@@ -911,9 +918,16 @@ def setup_chat_routes(
                                     elif data.get("type") == "tool_start":
                                         _agent_tool_calls += 1
                                     yield chunk
+                                elif data.get("type") == "fallback":
+                                    # Selected model failed; a fallback answered.
+                                    # Forward the notice and remember the real
+                                    # model so metrics reflect it, not the masked
+                                    # selected model.
+                                    _answered_by = data.get("answered_by") or _answered_by
+                                    yield chunk
                                 elif data.get("type") == "metrics":
                                     last_metrics = data.get("data", {})
-                                    last_metrics["model"] = sess.model
+                                    last_metrics["model"] = _answered_by or sess.model
                                     yield f'data: {json.dumps({"type": "metrics", "data": last_metrics})}\n\n'
                             except json.JSONDecodeError:
                                 yield chunk
diff --git a/src/agent_loop.py b/src/agent_loop.py
index b72a855..9c25e59 100644
--- a/src/agent_loop.py
+++ b/src/agent_loop.py
@@ -1638,6 +1638,12 @@ async def stream_agent_loop(
                         real_output_tokens += u.get("output_tokens", 0)
                         last_round_input_tokens = round_input
                         has_real_usage = True
+                    elif data.get("type") == "fallback":
+                        # The selected model failed and another answered; surface
+                        # the notice so a misconfigured provider isn't masked.
+                        logger.warning(f"[agent] round {round_num} fell back: "
+                                       f"{data.get('selected_model')} -> {data.get('answered_by')}")
+                        yield chunk
                     elif "delta" in data:
                         if not first_token_received:
                             time_to_first_token = time.time() - total_start
diff --git a/src/llm_core.py b/src/llm_core.py
index 8e22b96..1ca4bf6 100644
--- a/src/llm_core.py
+++ b/src/llm_core.py
@@ -1148,6 +1148,24 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
         yield f'event: error\ndata: {json.dumps({"error": str(e), "status": 502})}\n\n'
 
 
+def _summarize_stream_error(err_chunk: Optional[str]) -> str:
+    """Return a short (<=200 chars) reason parsed from an `event: error` SSE
+    chunk, or the generic "primary model failed" when nothing usable is found."""
+    if not isinstance(err_chunk, str):
+        return "primary model failed"
+    for line in err_chunk.split("\n"):
+        if not line.startswith("data: "):
+            continue
+        try:
+            j = json.loads(line[6:])
+            txt, status = j.get("text") or j.get("error") or "", j.get("status")
+        except (ValueError, AttributeError):  # bad JSON / non-object payload
+            continue  # keep scanning: a later data line may still be usable
+        msg = (f"HTTP {status}: " if status else "") + str(txt)
+        return msg[:200].strip() or "primary model failed"
+    return "primary model failed"
+
+
 async def stream_llm_with_fallback(candidates, messages, **kwargs):
     """Wrap stream_llm with an ordered fallback chain.
 
@@ -1166,6 +1184,7 @@ async def stream_llm_with_fallback(candidates, messages, **kwargs):
         yield f'event: error\ndata: {json.dumps({"error": "No model endpoint configured", "status": 503})}\n\n'
         return
 
+    primary_model = cands[0][1]
     last_error = None
     for i, (url, model, headers) in enumerate(cands):
         is_last = (i == len(cands) - 1)
@@ -1187,6 +1206,19 @@ async def stream_llm_with_fallback(candidates, messages, **kwargs):
                 continue
             # Any data chunk other than the terminal [DONE] means real output.
             if chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"):
+                # First real output from a NON-primary candidate: tell the client
+                # the selected model failed and another answered. Without this the
+                # fallback is invisible — a misconfigured provider looks like it
+                # works because the reply is shown under the originally selected
+                # model's name (e.g. a Bedrock/Claude endpoint that 400s every
+                # request but appears fine because another model silently answered).
+                if not emitted and i > 0:
+                    yield ('data: ' + json.dumps({
+                        "type": "fallback",
+                        "selected_model": primary_model,
+                        "answered_by": model,
+                        "reason": _summarize_stream_error(last_error),
+                    }) + '\n\n')
                 emitted = True
             yield chunk
         if not retried:
diff --git a/static/js/chat.js b/static/js/chat.js
index 3448686..4a7632c 100644
--- a/static/js/chat.js
+++ b/static/js/chat.js
@@ -1771,6 +1771,26 @@ import createResearchSynapse from './researchSynapse.js';
                     if (tsSpan) roleEl.appendChild(tsSpan);
                   }
                 }
+              } else if (json.type === 'fallback') {
+                // The selected model failed and another provider answered. Make
+                // it visible so a misconfigured provider is never silently
+                // masked under the selected model's name.
+                if (!_isBg) {
+                  var _selM = _shortModel(json.selected_model || '');
+                  var _ansM = _shortModel(json.answered_by || '');
+                  uiModule.showToast('⚠ ' + _selM + ' failed — answered by ' + _ansM, 6000);
+                  if (holder) {
+                    var _rEl = holder.querySelector('.role');
+                    if (_rEl) {
+                      var _tsS = _rEl.querySelector('.role-timestamp');
+                      _rEl.textContent = _ansM + ' (fallback) ';
+                      _rEl.title = (json.selected_model || '') + ' failed' +
+                        (json.reason ? ': ' + json.reason : '') + ' — answered by ' + (json.answered_by || '');
+                      _applyModelColor(_rEl, json.answered_by);
+                      if (_tsS) _rEl.appendChild(_tsS);
+                    }
+                  }
+                }
               } else if (json.type === 'attachments') {
                 if (_isBg) continue;
                 // Update user bubble — replace file chips with image previews
diff --git a/tests/test_llm_core_fallback.py b/tests/test_llm_core_fallback.py
new file mode 100644
index 0000000..9f30154
--- /dev/null
+++ b/tests/test_llm_core_fallback.py
@@ -0,0 +1,61 @@
+"""Tests for the fallback indicator in stream_llm_with_fallback.
+
+When the selected model fails *before output* and another candidate answers,
+a `fallback` event must be emitted so the switch is never masked under the
+selected model's name (which is how a misconfigured provider can look like it
+works while a different model silently answers).
+"""
+import json
+import asyncio
+
+from src import llm_core
+
+
+def _run_fallback(monkeypatch, per_model):
+    """Run stream_llm_with_fallback over a two-candidate chain ("primary" then
+    "backup") with stream_llm stubbed out, and return every chunk it yields.
+
+    `per_model(model)` supplies the canned SSE lines for each candidate."""
+    async def stub_stream(url, model, messages, **kw):
+        for sse_line in per_model(model):
+            yield sse_line
+
+    async def collect():
+        chain = [("u1", "primary", {}), ("u2", "backup", {})]
+        prompt = [{"role": "user", "content": "hi"}]
+        return [c async for c in llm_core.stream_llm_with_fallback(chain, prompt)]
+
+    # Swap in the stub before the event loop starts driving the wrapper.
+    monkeypatch.setattr(llm_core, "stream_llm", stub_stream)
+    return asyncio.run(collect())
+
+
+def test_fallback_emits_indicator_when_primary_fails(monkeypatch):
+    """Primary errors before output -> a `fallback` event precedes backup's reply."""
+    def per_model(model):
+        if model == "primary":
+            return ['event: error\ndata: {"status": 400, "text": "Provider X returned HTTP 400"}\n\n']
+        return ['data: {"delta": "hello"}\n\n', "data: [DONE]\n\n"]
+    chunks = _run_fallback(monkeypatch, per_model)
+    fb = [json.loads(c[6:]) for c in chunks if c.startswith("data: ") and '"fallback"' in c]
+    assert fb, f"no fallback event in {chunks}"
+    assert (fb[0]["type"], fb[0]["selected_model"], fb[0]["answered_by"]) == ("fallback", "primary", "backup")
+    assert "400" in fb[0]["reason"]
+    # Ordering: compare positions directly. A list of indices gathered via
+    # enumerate() is always ascending, so checking it is sorted proves nothing.
+    fb_at = next(i for i, c in enumerate(chunks) if '"fallback"' in c)
+    assert not any('"delta": "hello"' in c for c in chunks[:fb_at])
+    assert any('"delta": "hello"' in c for c in chunks[fb_at + 1:])
+
+
+def test_no_fallback_event_when_primary_succeeds(monkeypatch):
+    """A primary that answers on its own must not trigger any `fallback` event."""
+    canned = ['data: {"delta": "ok"}\n\n', "data: [DONE]\n\n"]
+    emitted = _run_fallback(monkeypatch, lambda model: canned)
+    assert all('"fallback"' not in c for c in emitted)
+
+
+def test_summarize_stream_error():
+    """Status shows up in the summary; unusable input yields the generic reason."""
+    assert "400" in llm_core._summarize_stream_error('event: error\ndata: {"status": 400, "text": "nope"}\n\n')
+    assert {llm_core._summarize_stream_error(bad) for bad in (None, "garbage")} == {"primary model failed"}