feat: adapt agent_input_token_budget to the model context window (#1170) (#1230)

The agent soft-trims input context to `agent_input_token_budget` (default 6000). The old computation `min(context_length or budget, budget)` made the 6000 default a hard ceiling for every model, so 128K/1M context models were silently capped at 6000 input tokens — now that num_ctx is sent correctly (#1056), this was the last barrier to actually using a long context window.

This derives the default budget from the model's discovered context window (~85%, capped at a generous hard max) while honouring an explicit user setting exactly (clamped to the window). When the window is unknown it falls back to the previous value, so behaviour is unchanged for that case.

- src/context_budget.py: pure `compute_input_token_budget()` (unit-testable)
- src/settings.py: `is_setting_overridden()` to tell an explicit user value from the merged default (load_settings merges DEFAULT_SETTINGS, so equality alone can't distinguish them)
- src/agent_loop.py: use the helper in the soft-trim path

Covered by tests/test_context_budget.py (6 cases).

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-02 23:13:53 +08:00
parent 1fda906407
commit 8c376d2b0e
4 changed files with 126 additions and 1 deletions
--- a/src/agent_loop.py
+++ b/src/agent_loop.py
@@ -1487,12 +1487,21 @@ async def stream_agent_loop(
    _t3 = time.time()
    try:
        from src.context_compactor import trim_for_context
+        from src.context_budget import compute_input_token_budget
+        from src.settings import is_setting_overridden

        soft_budget = int(get_setting("agent_input_token_budget", 6000) or 0)
        if soft_budget > 0:
            before_trim_tokens = estimate_tokens(messages)
            reserve_tokens = min(max(max_tokens or 1024, 512), 2048)
-            effective_budget = min(context_length or soft_budget, soft_budget)
+            # Scale the default budget to the model's context window so long-context
+            # models aren't silently capped at 6000; an explicit user setting is
+            # still honoured (clamped to the window). (#1170)
+            effective_budget = compute_input_token_budget(
+                soft_budget,
+                context_length,
+                is_setting_overridden("agent_input_token_budget"),
+            )
            trimmed_messages = trim_for_context(
                messages,
                effective_budget,
--- a/src/context_budget.py
+++ b/src/context_budget.py
@@ -0,0 +1,55 @@
+"""Adaptive input-token budget for the agent loop (#1170).
+
+The agent soft-trims its input context to ``agent_input_token_budget`` (default
+6000). The old computation was ``min(context_length or budget, budget)``, which
+made the 6000 default a hard ceiling for *every* model — so a 128K or 1M context
+model was silently capped at 6000 input tokens even though it can hold far more.
+
+This derives the effective budget from the model's discovered context window when
+the user has NOT set an explicit budget, while still honouring an explicit setting
+exactly (clamped to the window). Pure and side-effect free so it is unit-testable.
+"""
+
+# Generous ceiling so long-context models are unblocked without sending a
+# pathologically large prompt every agent turn. Tunable; chosen to fully cover
+# 128K models and give 1M models a large but bounded budget.
+DEFAULT_HARD_MAX = 200_000
+DEFAULT_BUDGET = 6000
+DEFAULT_HEADROOM = 0.85
+
+
+def compute_input_token_budget(
+    configured: int,
+    context_length: int,
+    explicit: bool,
+    *,
+    default: int = DEFAULT_BUDGET,
+    headroom: float = DEFAULT_HEADROOM,
+    hard_max: int = DEFAULT_HARD_MAX,
+) -> int:
+    """Return the effective soft input-token budget.
+
+    Args:
+        configured: the value read from settings (may be the default).
+        context_length: the model's discovered context window (0/None if unknown).
+        explicit: True if the user explicitly set ``agent_input_token_budget``.
+        default: returned when the window is unknown and ``configured`` is not > 0.
+        headroom: fraction of a known window to use; values > 1 stop at the window.
+        hard_max: ceiling applied to the adaptive (non-explicit) budget.
+
+    Rules:
+        - Explicit budget > 0: returned as-is, clamped to a known window.
+        - Else, known window: ``headroom`` of it, at most ``hard_max``/the window, min 1.
+        - Unknown window: ``configured`` if > 0, else ``default`` (old behaviour).
+    """
+    configured = int(configured or 0)
+    context_length = int(context_length or 0)
+
+    if explicit and configured > 0:
+        return min(configured, context_length) if context_length > 0 else configured
+
+    if context_length > 0:
+        scaled = int(context_length * headroom)
+        return max(1, min(scaled, hard_max, context_length))
+
+    return configured if configured > 0 else default
--- a/src/settings.py
+++ b/src/settings.py
@@ -195,6 +195,21 @@ def get_setting(key: str, default: Any = None) -> Any:
    return load_settings().get(key, default)


+def is_setting_overridden(key: str) -> bool:
+    """True if ``key`` is explicitly present in the saved settings file.
+
+    ``load_settings`` merges DEFAULT_SETTINGS with the saved file, so callers
+    that must tell an explicit user choice from the default read the raw file.
+    A missing, unreadable, malformed or non-object file counts as "not set".
+    """
+    try:
+        with open(SETTINGS_FILE, "r", encoding="utf-8") as f:
+            saved = json.load(f)
+    except (OSError, ValueError):  # ValueError covers JSON + Unicode decode errors
+        return False
+    return isinstance(saved, dict) and key in saved
+
+
 # Per-user settings (user prefs override the global admin default). Used for
 # keys that a user is allowed to choose individually — currently the vision
 # model + image-generation model. The owner argument is the authed username
--- a/tests/test_context_budget.py
+++ b/tests/test_context_budget.py
@@ -0,0 +1,46 @@
+"""Issue #1170 — the agent input-token budget adapts to the model context window.
+
+Pins the pure budget computation and the explicit-override detection.
+"""
+
+import json
+
+from src.context_budget import compute_input_token_budget, DEFAULT_HARD_MAX
+
+
+def test_default_scales_to_context_window():
+    budget = compute_input_token_budget(6000, 128000, explicit=False)
+    assert budget == int(128000 * 0.85)  # 85% of the window, not the old 6000 cap
+
+
+def test_default_capped_at_hard_max_for_huge_windows():
+    assert DEFAULT_HARD_MAX == compute_input_token_budget(6000, 1_000_000, explicit=False)
+
+
+def test_explicit_budget_is_honoured():
+    # Explicit values that fit inside a 128K window come back unchanged: the
+    # default-sized 6000 is not scaled up, and 50000 is not clamped.
+    for chosen in (6000, 50000):
+        assert compute_input_token_budget(chosen, 128000, explicit=True) == chosen
+
+
+def test_explicit_budget_clamped_to_window():
+    assert 32_000 == compute_input_token_budget(200_000, 32_000, explicit=True)
+
+
+def test_unknown_window_falls_back_to_configured():
+    for configured, expected in ((6000, 6000), (0, 6000)):  # 0 -> built-in default
+        assert compute_input_token_budget(configured, 0, explicit=False) == expected
+
+
+def test_is_setting_overridden_reads_raw_saved_file(tmp_path, monkeypatch):
+    import src.settings as settings
+
+    saved = tmp_path / "settings.json"
+    monkeypatch.setattr(settings, "SETTINGS_FILE", str(saved))
+    key = "agent_input_token_budget"
+    saved.write_text(json.dumps({key: 12000}), encoding="utf-8")
+    assert settings.is_setting_overridden(key) is True  # key is in the raw file
+    assert settings.is_setting_overridden("some_unset_key") is False
+    saved.write_text(json.dumps({}), encoding="utf-8")  # empty object: nothing set
+    assert settings.is_setting_overridden(key) is False