fix(model-context): key context-window cache by (endpoint, model) (#2614)

get_context_length() cached the resolved context window by model id alone, so two different remote endpoints serving the same model id (e.g. a capped proxy at 8k vs. the full provider at 200k) collided: the first to resolve won process-wide and the other endpoint was served the wrong window. That silently over-trims conversations on the larger-window endpoint (it feeds context_compactor) or overflows the smaller one (provider 400s). Key the cache on (endpoint_url, model). Local endpoints already always re-query, so they are unaffected. Fixes #2603
2026-06-05 00:50:56 +00:00
parent f8cf791491
commit 19a3fc59c9
2 changed files with 50 additions and 6 deletions
--- a/tests/test_context_cache_per_endpoint.py
+++ b/tests/test_context_cache_per_endpoint.py
@@ -0,0 +1,39 @@
+"""Regression for #2603 — model context-window cache must be keyed per endpoint.
+
+`get_context_length()` cached by model id alone, so two different remote endpoints
+serving the same model id (e.g. a capped proxy at 8k vs. the full provider at 200k)
+collided: whichever resolved first won process-wide and the other was served the
+wrong window. The fix keys the cache on (endpoint_url, model).
+"""
+
+import src.model_context as mc
+
+
+def _setup(monkeypatch, windows):
+    """windows: {endpoint_url: context_length}. Force the remote path."""
+    monkeypatch.setattr(mc, "_is_local_endpoint", lambda url: False)
+    monkeypatch.setattr(mc, "_configured_endpoint_kind", lambda url: "api")
+    monkeypatch.setattr(mc, "_query_context_length", lambda url, model: windows[url])
+    mc._context_cache.clear()
+
+
+def test_same_model_two_remote_endpoints_get_their_own_window(monkeypatch):
+    a, b = "https://proxy-a.example/v1", "https://provider-b.example/v1"
+    _setup(monkeypatch, {a: 8000, b: 200000})
+
+    assert mc.get_context_length(a, "shared-model") == 8000
+    # Same model id, different endpoint: must NOT return endpoint A's cached 8000.
+    assert mc.get_context_length(b, "shared-model") == 200000
+
+
+def test_cache_hit_still_works_per_endpoint(monkeypatch):
+    a, b = "https://proxy-a.example/v1", "https://provider-b.example/v1"
+    _setup(monkeypatch, {a: 8000, b: 200000})
+    mc.get_context_length(a, "shared-model")
+    mc.get_context_length(b, "shared-model")
+
+    # Both endpoints are now cached under their own key; flip the underlying
+    # query to prove subsequent reads come from the per-endpoint cache, not a re-query.
+    monkeypatch.setattr(mc, "_query_context_length", lambda url, model: 999)
+    assert mc.get_context_length(a, "shared-model") == 8000
+    assert mc.get_context_length(b, "shared-model") == 200000