When the selected model fails before producing output, stream_llm_with_fallback
quietly switches to the next candidate and the reply is shown under the
originally selected model's name, so a misconfigured provider looks like it
works. (Concretely: a Bedrock gateway that 400s every Anthropic/Claude request
appears fine because another model silently answers under the Claude label.)
Emit a `fallback` SSE event ({selected_model, answered_by, reason}) the first
time a non-primary candidate produces output, forward it through the agent loop
and both chat-route paths, stamp the response metrics with the model that
actually answered, and show a notice + relabel the reply in the UI.
Tested: python -m pytest tests/test_llm_core_fallback.py (3 pass);
python -m py_compile src/llm_core.py src/agent_loop.py routes/chat_routes.py;
node --check static/js/chat.js.
62 lines
2.5 KiB
Python
62 lines
2.5 KiB
Python
"""Tests for the fallback indicator in stream_llm_with_fallback.
|
|
|
|
When the selected model fails *before output* and another candidate answers,
|
|
a `fallback` event must be emitted so the switch is never masked under the
|
|
selected model's name (which is how a misconfigured provider can look like it
|
|
works while a different model silently answers).
|
|
"""
|
|
import json
|
|
import asyncio
|
|
|
|
from src import llm_core
|
|
|
|
|
|
def _run_fallback(monkeypatch, per_model):
    """Run stream_llm_with_fallback against a stubbed stream_llm and collect its output.

    `per_model` is called with a candidate's model name and returns the canned
    list of SSE lines the stub yields for that candidate. The candidate list is
    fixed to "primary" followed by "backup", with a single user message "hi".

    Returns the list of chunks emitted by stream_llm_with_fallback.
    """
    candidates = [("u1", "primary", {}), ("u2", "backup", {})]
    conversation = [{"role": "user", "content": "hi"}]

    async def fake_stream(url, model, messages, **kw):
        # Replay the canned lines for whichever candidate is being tried.
        for line in per_model(model):
            yield line

    # Swap the real network-backed streamer for the stub for this test only.
    monkeypatch.setattr(llm_core, "stream_llm", fake_stream)

    async def collect():
        stream = llm_core.stream_llm_with_fallback(candidates, conversation)
        return [chunk async for chunk in stream]

    return asyncio.run(collect())
|
|
|
|
|
|
def test_fallback_emits_indicator_when_primary_fails(monkeypatch):
    """A primary that errors before producing output must trigger a `fallback` event.

    The "primary" candidate yields only an SSE error (HTTP 400) while the
    "backup" candidate streams a normal answer. The emitted chunks must contain
    a fallback event naming both models and the failure reason, and that event
    must arrive strictly before the backup's answer content.
    """

    def per_model(model):
        if model == "primary":
            return ['event: error\ndata: {"status": 400, "text": "Provider X returned HTTP 400"}\n\n']
        return ['data: {"delta": "hello"}\n\n', "data: [DONE]\n\n"]

    def is_fallback(chunk):
        return chunk.startswith("data: ") and '"fallback"' in chunk

    def is_answer(chunk):
        return '"delta": "hello"' in chunk

    chunks = _run_fallback(monkeypatch, per_model)

    # c[6:] strips the leading "data: " so the JSON payload can be parsed.
    fb = [json.loads(c[6:]) for c in chunks if is_fallback(c)]
    assert fb, f"no fallback event in {chunks}"
    assert fb[0]["type"] == "fallback"
    assert fb[0]["selected_model"] == "primary"
    assert fb[0]["answered_by"] == "backup"
    assert "400" in fb[0]["reason"]

    # The backup's answer must still be delivered.
    answer_at = next((i for i, c in enumerate(chunks) if is_answer(c)), None)
    assert answer_at is not None, f"backup answer missing from {chunks}"

    # The fallback notice must precede the answer content. Comparing the two
    # positions directly is required: a list of indices gathered via
    # enumerate() is always ascending, so checking that it is sorted can
    # never fail regardless of the actual emission order.
    fallback_at = next(i for i, c in enumerate(chunks) if is_fallback(c))
    assert fallback_at < answer_at, (
        f"fallback event (chunk {fallback_at}) arrived after the answer "
        f"(chunk {answer_at}): {chunks}"
    )
|
|
|
|
|
|
def test_no_fallback_event_when_primary_succeeds(monkeypatch):
    """No `fallback` event may appear when the stubbed stream succeeds.

    Every candidate is given the same successful canned stream, so none of the
    emitted chunks should mention a fallback.
    """

    def per_model(model):
        # The model name is ignored: every candidate gets the same good stream.
        return ['data: {"delta": "ok"}\n\n', "data: [DONE]\n\n"]

    emitted = _run_fallback(monkeypatch, per_model)

    mentions_fallback = ['"fallback"' in chunk for chunk in emitted]
    assert True not in mentions_fallback
|
|
|
|
|
|
def test_summarize_stream_error():
    """Check _summarize_stream_error on a parseable error chunk and on unusable input.

    An SSE error chunk carrying status 400 must yield a summary that mentions
    "400"; `None` and a non-SSE string must both yield the generic
    "primary model failed" message.
    """
    summarize = llm_core._summarize_stream_error

    error_chunk = 'event: error\ndata: {"status": 400, "text": "nope"}\n\n'
    assert "400" in summarize(error_chunk)

    for unusable in (None, "garbage"):
        assert summarize(unusable) == "primary model failed"
|