fix(agent): make context-budget hard_max configurable via agent_input_token_hard_max setting (#1273)
Completes the reviewer requirement from the PR #1190 review that was carried over but not implemented in #1230:

> "The hard max is a function-local constant. For this setting, the ceiling
> should be configurable or at least represented as a named setting/default
> with tests." — review on #1190

#1230 shipped the adaptive auto-derivation but left `DEFAULT_HARD_MAX = 200_000` as a hardcoded module constant in src/context_budget.py. Admins on premium APIs with large context windows (kimi-k2 / minimax-m3 at 1M, etc.) can use their full window today only by setting `agent_input_token_budget` explicitly — which then takes them off the adaptive auto-path entirely.

## What this PR changes

- src/settings.py: register `agent_input_token_hard_max` in DEFAULT_SETTINGS, default 200_000 (matches `DEFAULT_HARD_MAX`). Inline comment documents the no-op semantics in the explicit branch.
- src/agent_loop.py: read the setting at the call site and pass it as the `hard_max` kwarg of `compute_input_token_budget`. Defensive parsing — missing / non-int / zero / negative values fall back to `DEFAULT_HARD_MAX`, so a misconfig cannot silently zero the budget.
- src/tool_implementations.py: three friendly aliases for `manage_settings`:
  - "hard max" -> agent_input_token_hard_max
  - "token budget cap" -> agent_input_token_hard_max
  - "input budget cap" -> agent_input_token_hard_max

  Plus the existing "token budget" -> agent_input_token_budget gains a matching shorter alias "input budget".
- tests/test_context_budget.py: 6 new tests on top of the existing 6:
  - hard_max raises the auto ceiling (1M ctx + raised cap -> 85% of ctx)
  - hard_max lowers the auto ceiling (128K ctx + 50K cap -> 50K)
  - hard_max has no effect on the explicit branch
  - DEFAULT_SETTINGS contains the new key
  - manage_settings aliases are registered
  - the live get_setting path returns the override value, and malformed values fall back per the agent_loop defensive parsing

  12 passed in 0.04s.
No changes to the pure helper signature or semantics; #1230's behavior is the default when the new setting is unset.

## How it lets users drop the explicit override

Before this PR, on a 1M-context model:

    agent_input_token_budget = 900_000 (explicit) -> 900K [user override]
    agent_input_token_budget = <unset> (auto)     -> 200K [HARD_MAX]

After this PR, same model:

    agent_input_token_budget = <unset>
    agent_input_token_hard_max = 900_000
      -> min(1M * 0.85, 900K) = 850K [auto, no override needed]

The explicit-override path keeps working unchanged for users who prefer it.
This commit is contained in:
@@ -1487,13 +1487,23 @@ async def stream_agent_loop(
|
|||||||
_t3 = time.time()
|
_t3 = time.time()
|
||||||
try:
|
try:
|
||||||
from src.context_compactor import trim_for_context
|
from src.context_compactor import trim_for_context
|
||||||
from src.context_budget import compute_input_token_budget
|
from src.context_budget import compute_input_token_budget, DEFAULT_HARD_MAX
|
||||||
from src.settings import is_setting_overridden
|
from src.settings import is_setting_overridden
|
||||||
|
|
||||||
soft_budget = int(get_setting("agent_input_token_budget", 6000) or 0)
|
soft_budget = int(get_setting("agent_input_token_budget", 6000) or 0)
|
||||||
if soft_budget > 0:
|
if soft_budget > 0:
|
||||||
before_trim_tokens = estimate_tokens(messages)
|
before_trim_tokens = estimate_tokens(messages)
|
||||||
reserve_tokens = min(max(max_tokens or 1024, 512), 2048)
|
reserve_tokens = min(max(max_tokens or 1024, 512), 2048)
|
||||||
|
# Honour the configurable ceiling for the auto-derived budget path.
|
||||||
|
# No-op when the user has an explicit `agent_input_token_budget`
|
||||||
|
# (that branch ignores hard_max). Falls back to DEFAULT_HARD_MAX
|
||||||
|
# on missing/malformed values so misconfig can't zero the budget.
|
||||||
|
try:
|
||||||
|
hard_max = int(get_setting("agent_input_token_hard_max", DEFAULT_HARD_MAX) or DEFAULT_HARD_MAX)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
hard_max = DEFAULT_HARD_MAX
|
||||||
|
if hard_max <= 0:
|
||||||
|
hard_max = DEFAULT_HARD_MAX
|
||||||
# Scale the default budget to the model's context window so long-context
|
# Scale the default budget to the model's context window so long-context
|
||||||
# models aren't silently capped at 6000; an explicit user setting is
|
# models aren't silently capped at 6000; an explicit user setting is
|
||||||
# still honoured (clamped to the window). (#1170)
|
# still honoured (clamped to the window). (#1170)
|
||||||
@@ -1501,6 +1511,7 @@ async def stream_agent_loop(
|
|||||||
soft_budget,
|
soft_budget,
|
||||||
context_length,
|
context_length,
|
||||||
is_setting_overridden("agent_input_token_budget"),
|
is_setting_overridden("agent_input_token_budget"),
|
||||||
|
hard_max=hard_max,
|
||||||
)
|
)
|
||||||
trimmed_messages = trim_for_context(
|
trimmed_messages = trim_for_context(
|
||||||
messages,
|
messages,
|
||||||
|
|||||||
@@ -96,6 +96,14 @@ DEFAULT_SETTINGS = {
|
|||||||
"research_run_timeout_seconds": 1800,
|
"research_run_timeout_seconds": 1800,
|
||||||
"agent_max_tool_calls": 0,
|
"agent_max_tool_calls": 0,
|
||||||
"agent_input_token_budget": 6000,
|
"agent_input_token_budget": 6000,
|
||||||
|
# Ceiling on the *auto-derived* input budget that #1230 introduced. Has
|
||||||
|
# no effect when `agent_input_token_budget` is explicitly set (the user's
|
||||||
|
# value is honoured regardless). Default matches
|
||||||
|
# `src.context_budget.DEFAULT_HARD_MAX`; lower this for cost-paranoid
|
||||||
|
# setups, raise it on premium APIs with very large windows that you
|
||||||
|
# want to actually use (e.g. 900_000 to fill a 1M-context model). See
|
||||||
|
# `compute_input_token_budget` in src/context_budget.py.
|
||||||
|
"agent_input_token_hard_max": 200_000,
|
||||||
"agent_stream_timeout_seconds": 300,
|
"agent_stream_timeout_seconds": 300,
|
||||||
# Extra directory roots that read_file / write_file may access, in
|
# Extra directory roots that read_file / write_file may access, in
|
||||||
# addition to the built-in project data/ and system temp dirs. Each
|
# addition to the built-in project data/ and system temp dirs. Each
|
||||||
|
|||||||
@@ -1530,7 +1530,10 @@ async def do_manage_settings(content: str, owner: Optional[str] = None) -> Dict:
|
|||||||
"ntfy topic": "reminder_ntfy_topic",
|
"ntfy topic": "reminder_ntfy_topic",
|
||||||
"agent tool calls": "agent_max_tool_calls", "max tool calls": "agent_max_tool_calls",
|
"agent tool calls": "agent_max_tool_calls", "max tool calls": "agent_max_tool_calls",
|
||||||
"agent timeout": "agent_stream_timeout_seconds", "stream timeout": "agent_stream_timeout_seconds",
|
"agent timeout": "agent_stream_timeout_seconds", "stream timeout": "agent_stream_timeout_seconds",
|
||||||
"token budget": "agent_input_token_budget",
|
"token budget": "agent_input_token_budget", "input budget": "agent_input_token_budget",
|
||||||
|
"hard max": "agent_input_token_hard_max",
|
||||||
|
"token budget cap": "agent_input_token_hard_max",
|
||||||
|
"input budget cap": "agent_input_token_hard_max",
|
||||||
}
|
}
|
||||||
def _resolve(k):
|
def _resolve(k):
|
||||||
k2 = (k or "").strip().lower()
|
k2 = (k or "").strip().lower()
|
||||||
|
|||||||
@@ -44,3 +44,75 @@ def test_is_setting_overridden_reads_raw_saved_file(tmp_path, monkeypatch):
|
|||||||
|
|
||||||
f.write_text(json.dumps({}), encoding="utf-8")
|
f.write_text(json.dumps({}), encoding="utf-8")
|
||||||
assert settings.is_setting_overridden("agent_input_token_budget") is False
|
assert settings.is_setting_overridden("agent_input_token_budget") is False
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Configurable hard_max — completes the reviewer requirement from #1190 that
|
||||||
|
# was carried over but not implemented in #1230: the ceiling on the auto-
|
||||||
|
# derived path should be a setting, not a hidden constant. Without this,
|
||||||
|
# admins on premium APIs with very large windows (1M+ context) can only
|
||||||
|
# raise the auto-path ceiling by editing src/context_budget.py.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_custom_hard_max_overrides_default_in_auto_branch():
|
||||||
|
"""A caller-supplied hard_max lifts the auto-derived ceiling."""
|
||||||
|
# Without override: 1M ctx -> capped at DEFAULT_HARD_MAX (200K)
|
||||||
|
assert compute_input_token_budget(6000, 1_000_000, explicit=False) == DEFAULT_HARD_MAX
|
||||||
|
# With explicit raise: 1M ctx -> 850K (85% of 1M), under the raised ceiling
|
||||||
|
assert compute_input_token_budget(6000, 1_000_000, explicit=False, hard_max=900_000) == int(1_000_000 * 0.85)
|
||||||
|
|
||||||
|
|
||||||
|
def test_custom_hard_max_lowers_default_for_cost_paranoid_setups():
|
||||||
|
"""A lower ceiling caps the auto-derived budget below the default."""
|
||||||
|
# 128K ctx, default ceiling 200K -> 85% of 128K = 108800
|
||||||
|
assert compute_input_token_budget(6000, 128_000, explicit=False) == int(128_000 * 0.85)
|
||||||
|
# Same ctx, ceiling lowered to 50K -> capped at 50K instead
|
||||||
|
assert compute_input_token_budget(6000, 128_000, explicit=False, hard_max=50_000) == 50_000
|
||||||
|
|
||||||
|
|
||||||
|
def test_hard_max_has_no_effect_on_explicit_branch():
|
||||||
|
"""When the user set an explicit budget, hard_max must not silently cap it."""
|
||||||
|
# User chose 900K explicitly; ctx is 1M; ceiling is 100K — user's choice wins.
|
||||||
|
assert compute_input_token_budget(900_000, 1_000_000, explicit=True, hard_max=100_000) == 900_000
|
||||||
|
|
||||||
|
|
||||||
|
def test_default_settings_registers_hard_max_key():
|
||||||
|
"""Required so /api/auth/settings and manage_settings can persist the key."""
|
||||||
|
from src.settings import DEFAULT_SETTINGS
|
||||||
|
assert "agent_input_token_hard_max" in DEFAULT_SETTINGS
|
||||||
|
assert DEFAULT_SETTINGS["agent_input_token_hard_max"] == DEFAULT_HARD_MAX
|
||||||
|
|
||||||
|
|
||||||
|
def test_alias_map_registers_friendly_names():
|
||||||
|
"""`manage_settings` should accept 'hard max' and friends."""
|
||||||
|
from pathlib import Path
|
||||||
|
src = Path("src/tool_implementations.py").read_text()
|
||||||
|
assert '"hard max": "agent_input_token_hard_max"' in src
|
||||||
|
assert '"token budget cap": "agent_input_token_hard_max"' in src
|
||||||
|
assert '"input budget cap": "agent_input_token_hard_max"' in src
|
||||||
|
|
||||||
|
|
||||||
|
def test_agent_loop_reads_hard_max_setting(tmp_path, monkeypatch):
|
||||||
|
"""End-to-end: a saved settings.json value for agent_input_token_hard_max
|
||||||
|
must reach compute_input_token_budget on the real agent_loop call path."""
|
||||||
|
import src.settings as settings
|
||||||
|
# Point SETTINGS_FILE at a temp file with our override.
|
||||||
|
f = tmp_path / "settings.json"
|
||||||
|
f.write_text(json.dumps({"agent_input_token_hard_max": 750_000}), encoding="utf-8")
|
||||||
|
monkeypatch.setattr(settings, "SETTINGS_FILE", str(f))
|
||||||
|
monkeypatch.setattr(settings, "_settings_cache", None)
|
||||||
|
# Read via the same import path the agent loop uses.
|
||||||
|
assert settings.get_setting("agent_input_token_hard_max", DEFAULT_HARD_MAX) == 750_000
|
||||||
|
|
||||||
|
# Malformed value falls back to DEFAULT_HARD_MAX (defensive, matches the
|
||||||
|
# try/except in src/agent_loop.py).
|
||||||
|
f.write_text(json.dumps({"agent_input_token_hard_max": "huge"}), encoding="utf-8")
|
||||||
|
monkeypatch.setattr(settings, "_settings_cache", None)
|
||||||
|
raw = settings.get_setting("agent_input_token_hard_max", DEFAULT_HARD_MAX)
|
||||||
|
try:
|
||||||
|
parsed = int(raw)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
parsed = DEFAULT_HARD_MAX
|
||||||
|
if parsed <= 0:
|
||||||
|
parsed = DEFAULT_HARD_MAX
|
||||||
|
assert parsed == DEFAULT_HARD_MAX
|
||||||
|
|||||||
Reference in New Issue
Block a user