From 6776c7d69105b78bc1865100f9f3d0e4c90acb35 Mon Sep 17 00:00:00 2001 From: James Arslan Date: Tue, 2 Jun 2026 04:37:25 +0200 Subject: [PATCH] Surface silent model fallback instead of masking it (#868) When the selected model fails before producing output, stream_llm_with_fallback quietly switches to the next candidate and the reply is shown under the originally selected model's name, so a misconfigured provider looks like it works. (Concretely: a Bedrock gateway that 400s every Anthropic/Claude request appears fine because another model silently answers under the Claude label.) Emit a `fallback` SSE event ({selected_model, answered_by, reason}) the first time a non-primary candidate produces output, forward it through the agent loop and both chat-route paths, stamp the response metrics with the model that actually answered, and show a notice + relabel the reply in the UI. Tested: python -m pytest tests/test_llm_core_fallback.py (3 pass); python -m py_compile src/llm_core.py src/agent_loop.py routes/chat_routes.py; node --check static/js/chat.js. --- routes/chat_routes.py | 18 ++++++++-- src/agent_loop.py | 6 ++++ src/llm_core.py | 32 +++++++++++++++++ static/js/chat.js | 20 +++++++++++ tests/test_llm_core_fallback.py | 61 +++++++++++++++++++++++++++++++++ 5 files changed, 135 insertions(+), 2 deletions(-) create mode 100644 tests/test_llm_core_fallback.py diff --git a/routes/chat_routes.py b/routes/chat_routes.py index b2e9dd5..84143d0 100644 --- a/routes/chat_routes.py +++ b/routes/chat_routes.py @@ -769,6 +769,7 @@ def setup_chat_routes( return elif chat_mode == "chat": _chat_start = time.time() + _answered_by = None # set if the selected model failed and a fallback answered # ── Chat mode: call stream_llm directly, NO tools, NO document access ── try: _chat_candidates = [(sess.endpoint_url, sess.model, sess.headers)] + _fallback_candidates @@ -797,9 +798,14 @@ def setup_chat_routes( full_response += data["delta"] _stream_set(session, partial=full_response) yield chunk + elif data.get("type") == "fallback": + # Selected model failed; a fallback answered. + # Forward the notice and remember the real model. + _answered_by = data.get("answered_by") or _answered_by + yield chunk elif data.get("type") == "usage": last_metrics = data.get("data", {}) - last_metrics["model"] = sess.model + last_metrics["model"] = _answered_by or sess.model if ctx.context_length and last_metrics.get("input_tokens"): pct = min(round((last_metrics["input_tokens"] / ctx.context_length) * 100, 1), 100.0) last_metrics["context_percent"] = pct @@ -867,6 +873,7 @@ def setup_chat_routes( # ── Agent mode: full agent loop with tools ── _agent_rounds = 0 _agent_tool_calls = 0 + _answered_by = None # set if the selected model failed and a fallback answered try: from src.settings import get_setting _tool_budget = int(get_setting("agent_max_tool_calls", 0)) @@ -911,9 +918,16 @@ def setup_chat_routes( elif data.get("type") == "tool_start": _agent_tool_calls += 1 yield chunk + elif data.get("type") == "fallback": + # Selected model failed; a fallback answered. + # Forward the notice and remember the real + # model so metrics reflect it, not the masked + # selected model. + _answered_by = data.get("answered_by") or _answered_by + yield chunk elif data.get("type") == "metrics": last_metrics = data.get("data", {}) - last_metrics["model"] = sess.model + last_metrics["model"] = _answered_by or sess.model yield f'data: {json.dumps({"type": "metrics", "data": last_metrics})}\n\n' except json.JSONDecodeError: yield chunk diff --git a/src/agent_loop.py b/src/agent_loop.py index b72a855..9c25e59 100644 --- a/src/agent_loop.py +++ b/src/agent_loop.py @@ -1638,6 +1638,12 @@ async def stream_agent_loop( real_output_tokens += u.get("output_tokens", 0) last_round_input_tokens = round_input has_real_usage = True + elif data.get("type") == "fallback": + # The selected model failed and another answered; surface + # the notice so a misconfigured provider isn't masked. + logger.warning(f"[agent] round {round_num} fell back: " + f"{data.get('selected_model')} -> {data.get('answered_by')}") + yield chunk elif "delta" in data: if not first_token_received: time_to_first_token = time.time() - total_start diff --git a/src/llm_core.py b/src/llm_core.py index 8e22b96..1ca4bf6 100644 --- a/src/llm_core.py +++ b/src/llm_core.py @@ -1148,6 +1148,24 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl yield f'event: error\ndata: {json.dumps({"error": str(e), "status": 502})}\n\n' +def _summarize_stream_error(err_chunk: Optional[str]) -> str: + """Pull a short human reason out of an `event: error` SSE chunk for the + fallback notice. Returns a generic message if it can't be parsed.""" + if not err_chunk: + return "primary model failed" + try: + for line in err_chunk.split("\n"): + if line.startswith("data: "): + j = json.loads(line[6:]) + txt = j.get("text") or j.get("error") or "" + status = j.get("status") + msg = (f"HTTP {status}: " if status else "") + str(txt) + return msg[:200].strip() or "primary model failed" + except Exception: + pass + return "primary model failed" + + async def stream_llm_with_fallback(candidates, messages, **kwargs): """Wrap stream_llm with an ordered fallback chain. @@ -1166,6 +1184,7 @@ async def stream_llm_with_fallback(candidates, messages, **kwargs): yield f'event: error\ndata: {json.dumps({"error": "No model endpoint configured", "status": 503})}\n\n' return + primary_model = cands[0][1] last_error = None for i, (url, model, headers) in enumerate(cands): is_last = (i == len(cands) - 1) @@ -1187,6 +1206,19 @@ async def stream_llm_with_fallback(candidates, messages, **kwargs): continue # Any data chunk other than the terminal [DONE] means real output. if chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"): + # First real output from a NON-primary candidate: tell the client + # the selected model failed and another answered. Without this the + # fallback is invisible — a misconfigured provider looks like it + # works because the reply is shown under the originally selected + # model's name (e.g. a Bedrock/Claude endpoint that 400s every + # request but appears fine because another model silently answered). + if not emitted and i > 0: + yield ('data: ' + json.dumps({ + "type": "fallback", + "selected_model": primary_model, + "answered_by": model, + "reason": _summarize_stream_error(last_error), + }) + '\n\n') emitted = True yield chunk if not retried: diff --git a/static/js/chat.js b/static/js/chat.js index 3448686..4a7632c 100644 --- a/static/js/chat.js +++ b/static/js/chat.js @@ -1771,6 +1771,26 @@ import createResearchSynapse from './researchSynapse.js'; if (tsSpan) roleEl.appendChild(tsSpan); } } + } else if (json.type === 'fallback') { + // The selected model failed and another provider answered. Make + // it visible so a misconfigured provider is never silently + // masked under the selected model's name. + if (!_isBg) { + var _selM = _shortModel(json.selected_model || ''); + var _ansM = _shortModel(json.answered_by || ''); + uiModule.showToast('⚠ ' + _selM + ' failed — answered by ' + _ansM, 6000); + if (holder) { + var _rEl = holder.querySelector('.role'); + if (_rEl) { + var _tsS = _rEl.querySelector('.role-timestamp'); + _rEl.textContent = _ansM + ' (fallback) '; + _rEl.title = (json.selected_model || '') + ' failed' + + (json.reason ? ': ' + json.reason : '') + ' — answered by ' + (json.answered_by || ''); + _applyModelColor(_rEl, json.answered_by); + if (_tsS) _rEl.appendChild(_tsS); + } + } + } } else if (json.type === 'attachments') { if (_isBg) continue; // Update user bubble — replace file chips with image previews diff --git a/tests/test_llm_core_fallback.py b/tests/test_llm_core_fallback.py new file mode 100644 index 0000000..9f30154 --- /dev/null +++ b/tests/test_llm_core_fallback.py @@ -0,0 +1,61 @@ +"""Tests for the fallback indicator in stream_llm_with_fallback. + +When the selected model fails *before output* and another candidate answers, +a `fallback` event must be emitted so the switch is never masked under the +selected model's name (which is how a misconfigured provider can look like it +works while a different model silently answers). +""" +import json +import asyncio + +from src import llm_core + + +def _run_fallback(monkeypatch, per_model): + """Drive stream_llm_with_fallback with a stubbed stream_llm that returns a + canned SSE line list per candidate model. Returns the emitted chunks.""" + async def fake_stream(url, model, messages, **kw): + for ln in per_model(model): + yield ln + monkeypatch.setattr(llm_core, "stream_llm", fake_stream) + + async def run(): + out = [] + async for c in llm_core.stream_llm_with_fallback( + [("u1", "primary", {}), ("u2", "backup", {})], [{"role": "user", "content": "hi"}] + ): + out.append(c) + return out + + return asyncio.run(run()) + + +def test_fallback_emits_indicator_when_primary_fails(monkeypatch): + def per_model(model): + if model == "primary": + return ['event: error\ndata: {"status": 400, "text": "Provider X returned HTTP 400"}\n\n'] + return ['data: {"delta": "hello"}\n\n', "data: [DONE]\n\n"] + chunks = _run_fallback(monkeypatch, per_model) + fb = [json.loads(c[6:]) for c in chunks if c.startswith("data: ") and '"fallback"' in c] + assert fb, f"no fallback event in {chunks}" + assert fb[0]["type"] == "fallback" + assert fb[0]["selected_model"] == "primary" + assert fb[0]["answered_by"] == "backup" + assert "400" in fb[0]["reason"] + # the fallback notice must precede the answer content + order = [i for i, c in enumerate(chunks) if '"fallback"' in c or '"delta": "hello"' in c] + assert order == sorted(order) + assert any('"delta": "hello"' in c for c in chunks) + + +def test_no_fallback_event_when_primary_succeeds(monkeypatch): + def per_model(model): + return ['data: {"delta": "ok"}\n\n', "data: [DONE]\n\n"] + chunks = _run_fallback(monkeypatch, per_model) + assert not any('"fallback"' in c for c in chunks) + + +def test_summarize_stream_error(): + assert "400" in llm_core._summarize_stream_error('event: error\ndata: {"status": 400, "text": "nope"}\n\n') + assert llm_core._summarize_stream_error(None) == "primary model failed" + assert llm_core._summarize_stream_error("garbage") == "primary model failed"