feat: adapt agent_input_token_budget to the model context window (#1170) (#1230)

The agent soft-trims input context to `agent_input_token_budget` (default 6000).
The old computation `min(context_length or budget, budget)` made the 6000 default
a hard ceiling for every model, so 128K/1M context models were silently capped at
6000 input tokens — now that num_ctx is sent correctly (#1056), this was the last
barrier to actually using a long context window.

This derives the default budget from the model's discovered context window
(~85%, capped at a generous hard max) while honouring an explicit user setting
as given (reduced only when it exceeds the window). When the window is unknown
it falls back to the configured value (6000 by default), so behaviour is
unchanged for that case.

- src/context_budget.py: pure `compute_input_token_budget()` (unit-testable)
- src/settings.py: `is_setting_overridden()` to tell an explicit user value from
  the merged default (load_settings merges DEFAULT_SETTINGS, so equality alone
  can't distinguish them)
- src/agent_loop.py: use the helper in the soft-trim path

Covered by tests/test_context_budget.py (6 cases).

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
lekt8
2026-06-02 23:13:53 +08:00
committed by GitHub
parent 1fda906407
commit 8c376d2b0e
4 changed files with 126 additions and 1 deletions

View File

@@ -1487,12 +1487,21 @@ async def stream_agent_loop(
_t3 = time.time()
try:
from src.context_compactor import trim_for_context
from src.context_budget import compute_input_token_budget
from src.settings import is_setting_overridden
soft_budget = int(get_setting("agent_input_token_budget", 6000) or 0)
if soft_budget > 0:
before_trim_tokens = estimate_tokens(messages)
reserve_tokens = min(max(max_tokens or 1024, 512), 2048)
effective_budget = min(context_length or soft_budget, soft_budget)
# Scale the default budget to the model's context window so long-context
# models aren't silently capped at 6000; an explicit user setting is
# still honoured (clamped to the window). (#1170)
effective_budget = compute_input_token_budget(
soft_budget,
context_length,
is_setting_overridden("agent_input_token_budget"),
)
trimmed_messages = trim_for_context(
messages,
effective_budget,

55
src/context_budget.py Normal file
View File

@@ -0,0 +1,55 @@
"""Adaptive input-token budget for the agent loop (#1170).
The agent soft-trims its input context to ``agent_input_token_budget`` (default
6000). The old computation was ``min(context_length or budget, budget)``, which
made the 6000 default a hard ceiling for *every* model — so a 128K or 1M context
model was silently capped at 6000 input tokens even though it can hold far more.
This derives the effective budget from the model's discovered context window when
the user has NOT set an explicit budget, while still honouring an explicit setting
exactly (clamped to the window). Pure and side-effect free so it is unit-testable.
"""
# Upper bound applied to the window-derived budget: long-context models are
# unblocked, but no agent turn sends a pathologically large prompt. Tunable;
# chosen to fully cover 128K models and to give 1M models a large but bounded
# budget.
DEFAULT_HARD_MAX = 200_000
# Budget returned when neither a configured value nor a context window is known.
DEFAULT_BUDGET = 6000
# Fraction of the context window used for the window-derived budget.
DEFAULT_HEADROOM = 0.85


def compute_input_token_budget(
    configured: int,
    context_length: int,
    explicit: bool,
    *,
    default: int = DEFAULT_BUDGET,
    headroom: float = DEFAULT_HEADROOM,
    hard_max: int = DEFAULT_HARD_MAX,
) -> int:
    """Return the effective soft input-token budget.

    Args:
        configured: the value read from settings (may be the default). Falsy
            values are treated as 0.
        context_length: the model's discovered context window. Falsy values
            are treated as 0, and anything <= 0 means "unknown".
        explicit: True if the user explicitly set ``agent_input_token_budget``.
        default: returned when the window is unknown and ``configured`` is
            not positive.
        headroom: fraction of the window used when deriving the budget.
        hard_max: ceiling applied to the window-derived budget.

    Returns:
        - Explicit, positive ``configured``: that value, lowered to the window
          when the window is known and smaller.
        - Otherwise, with a known window: ``int(window * headroom)`` capped at
          ``hard_max`` and never below 1.
        - Otherwise: ``configured`` if positive, else ``default``.
    """
    budget = int(configured or 0)
    window = int(context_length or 0)
    window_known = window > 0
    has_budget = budget > 0

    # An explicit user choice wins; it is only ever reduced to fit the window.
    if explicit and has_budget:
        if not window_known:
            return budget
        return budget if budget <= window else window

    # Nothing to scale against: keep the configured value, or the default.
    if not window_known:
        return budget if has_budget else default

    # Default path: take a share of the window, bounded to [1, hard_max].
    share_of_window = int(window * headroom)
    return max(1, min(share_of_window, hard_max))

View File

@@ -195,6 +195,21 @@ def get_setting(key: str, default: Any = None) -> Any:
return load_settings().get(key, default)
def is_setting_overridden(key: str) -> bool:
    """Return True if ``key`` is explicitly present in the saved settings file.

    ``load_settings`` merges DEFAULT_SETTINGS with the saved file, so a value
    equal to its default is indistinguishable from "never set" via
    ``get_setting``. Callers that need to treat an explicit user choice
    differently from the default (e.g. adaptive budgets) use this to inspect
    the raw saved file instead.

    Args:
        key: top-level setting name to look for.

    Returns:
        True only when the file at ``SETTINGS_FILE`` can be read, parses as a
        JSON object, and has ``key`` among its top-level keys. A missing,
        unreadable, undecodable, or malformed file, or a JSON root that is not
        an object, yields False.
    """
    try:
        with open(SETTINGS_FILE, "r", encoding="utf-8") as f:
            saved = json.load(f)
    except (OSError, json.JSONDecodeError, UnicodeDecodeError):
        # OSError covers FileNotFoundError as well as permission and
        # "is a directory" failures; none of them can prove an override.
        return False
    # Only a JSON object has setting keys. Without this guard a string root
    # would do substring matching, a list root element matching, and a
    # null/number root would raise TypeError.
    return isinstance(saved, dict) and key in saved
# Per-user settings (user prefs override the global admin default). Used for
# keys that a user is allowed to choose individually — currently the vision
# model + image-generation model. The owner argument is the authed username

View File

@@ -0,0 +1,46 @@
"""Issue #1170 — the agent input-token budget adapts to the model context window.
Pins the pure budget computation and the explicit-override detection.
"""
import json
from src.context_budget import compute_input_token_budget, DEFAULT_HARD_MAX
def test_default_scales_to_context_window():
    """A non-explicit budget on a 128K window resolves to 85% of that window."""
    window = 128000
    budget = compute_input_token_budget(6000, window, explicit=False)
    # The old computation would have capped this at 6000.
    assert budget == int(window * 0.85)
def test_default_capped_at_hard_max_for_huge_windows():
    """A 1M-token window yields the hard maximum, not 85% of the window."""
    budget = compute_input_token_budget(6000, 1_000_000, explicit=False)
    assert budget == DEFAULT_HARD_MAX
def test_explicit_budget_is_honoured():
    """Explicit budgets that fit inside the window come back unchanged."""
    # 6000: the user explicitly chose the default value on a 128K model.
    # 50000: a larger explicit choice that is still below the window.
    for chosen in (6000, 50000):
        assert compute_input_token_budget(chosen, 128000, explicit=True) == chosen
def test_explicit_budget_clamped_to_window():
    """An explicit budget larger than the window is reduced to the window."""
    window = 32000
    budget = compute_input_token_budget(200000, window, explicit=True)
    assert budget == window
def test_unknown_window_falls_back_to_configured():
    """With no known window the configured value is used, else the default."""
    with_configured = compute_input_token_budget(6000, 0, explicit=False)
    assert with_configured == 6000
    # Nothing configured either: the built-in default (6000) is returned.
    with_nothing = compute_input_token_budget(0, 0, explicit=False)
    assert with_nothing == 6000
def test_is_setting_overridden_reads_raw_saved_file(tmp_path, monkeypatch):
    """Override detection follows the keys stored in the settings file on disk."""
    import src.settings as settings

    saved_file = tmp_path / "settings.json"
    monkeypatch.setattr(settings, "SETTINGS_FILE", str(saved_file))

    # A key written to the file counts as overridden; an absent key does not.
    saved_file.write_text(
        json.dumps({"agent_input_token_budget": 12000}), encoding="utf-8"
    )
    assert settings.is_setting_overridden("agent_input_token_budget") is True
    assert settings.is_setting_overridden("some_unset_key") is False

    # After the file is rewritten without the key, it is no longer overridden.
    saved_file.write_text(json.dumps({}), encoding="utf-8")
    assert settings.is_setting_overridden("agent_input_token_budget") is False