_build_ollama_payload sends options.temperature and options.num_predict to /api/chat, but never options.num_ctx. Ollama defaults num_ctx to 2048 when the option is omitted, so prompts going to any Ollama backend are silently truncated there regardless of the model's actual capability. Thread the discovered context length through the three call sites (llm_call, llm_call_async, stream_llm) and emit options.num_ctx when it is known and positive. The builder filters out the DEFAULT_CONTEXT fallback (128000) so we don't lie to Ollama about models whose window we couldn't actually discover. The issue's literal 'when > 2048' heuristic is dropped: a model with a real context smaller than 2048 would OOM if Ollama used its default, so we pass the real value regardless of size. Matches how src/context_compactor.py uses the same helper. Sister fix to PR #753 — that PR teaches the compactor the right budget, this one tells Ollama to actually use that budget on the way in.
243 lines
9.0 KiB
Python
243 lines
9.0 KiB
Python
"""Regression tests for native Ollama Cloud provider handling."""
|
|
import httpx
|
|
|
|
from src import llm_core
|
|
|
|
|
|
def test_detects_ollama_cloud_native_provider():
    """Both the API root and the chat endpoint must be detected as "ollama"."""
    for endpoint in ("https://ollama.com/api", "https://ollama.com/api/chat"):
        assert llm_core._detect_provider(endpoint) == "ollama"
|
|
|
|
|
|
def test_llm_call_posts_native_ollama_payload(monkeypatch):
    """llm_call must POST to /api/chat with a non-streaming body and options."""
    captured = {}

    def recording_post(url, headers=None, json=None, timeout=None):
        # Remember everything llm_call passed to httpx.post, then answer
        # with a canned successful chat response.
        captured.update(url=url, headers=headers, json=json, timeout=timeout)
        return httpx.Response(
            200,
            request=httpx.Request("POST", url),
            json={"message": {"content": "OK"}, "done": True},
        )

    monkeypatch.setattr(llm_core.httpx, "post", recording_post)

    conversation = [{"role": "user", "content": "Say OK"}]
    reply = llm_core.llm_call(
        "https://ollama.com/api",
        "gpt-oss:120b-test",
        conversation,
        temperature=0.2,
        max_tokens=7,
        headers={"Authorization": "Bearer ollama-key"},
        timeout=11,
    )

    assert reply == "OK"
    assert captured["url"] == "https://ollama.com/api/chat"
    assert captured["headers"]["Authorization"] == "Bearer ollama-key"
    body = captured["json"]
    assert body["stream"] is False
    assert body["options"] == {"temperature": 0.2, "num_predict": 7}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Tool-call argument serialization for native Ollama
|
|
#
|
|
# Odysseus carries assistant tool calls in the OpenAI shape, where
|
|
# `function.arguments` is a JSON *string*. Native Ollama /api/chat expects a
|
|
# JSON *object* and rejects the string form with HTTP 400 ("Value looks like
|
|
# object, but can't find closing '}' symbol"), aborting every follow-up
|
|
# (tool-result) round. _build_ollama_payload must parse it back to an object.
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _assistant_tool_call_msgs():
|
|
"""A canonical OpenAI-style assistant tool call + tool result, as produced by
|
|
agent_loop._append_tool_results (arguments are a JSON string)."""
|
|
return [
|
|
{"role": "user", "content": "what do you know about me?"},
|
|
{
|
|
"role": "assistant",
|
|
"content": None,
|
|
"tool_calls": [
|
|
{
|
|
"id": "call_0",
|
|
"type": "function",
|
|
"function": {"name": "app_api", "arguments": '{"action": "get_memory"}'},
|
|
}
|
|
],
|
|
},
|
|
{"role": "tool", "tool_call_id": "call_0", "content": "Memory: user is James."},
|
|
]
|
|
|
|
|
|
def test_ollama_payload_parses_string_arguments_to_object():
    """JSON-string tool-call arguments must come out of the builder as a dict."""
    payload = llm_core._build_ollama_payload(
        "gpt-oss:120b",
        _assistant_tool_call_msgs(),
        temperature=0.0,
        max_tokens=0,
    )
    assistant_msg = payload["messages"][1]
    parsed_args = assistant_msg["tool_calls"][0]["function"]["arguments"]
    # The whole point: arguments must be a dict, not the JSON string.
    assert parsed_args == {"action": "get_memory"}
    assert not isinstance(parsed_args, str)
    # The rest of the tool call must be carried over unchanged.
    assert assistant_msg["tool_calls"][0]["function"]["name"] == "app_api"
    assert assistant_msg["tool_calls"][0]["id"] == "call_0"
|
|
|
|
|
|
def test_ollama_payload_drops_gemini_thought_signature():
    """Gemini's opaque ``extra_content`` must not leak into the Ollama payload.

    A cross-provider fallback can hand Ollama a tool call that still carries
    that field; it is meaningless to Ollama.
    """
    conversation = _assistant_tool_call_msgs()
    gemini_extra = {"google": {"thought_signature": "AAAA"}}
    conversation[1]["tool_calls"][0]["extra_content"] = gemini_extra

    payload = llm_core._build_ollama_payload(
        "gpt-oss:120b",
        conversation,
        temperature=0.0,
        max_tokens=0,
    )

    emitted_call = payload["messages"][1]["tool_calls"][0]
    assert "extra_content" not in emitted_call
    assert emitted_call["function"]["arguments"] == {"action": "get_memory"}
|
|
|
|
|
|
def test_ollama_payload_leaves_plain_messages_untouched():
    """A message without tool calls must pass through the builder as-is."""
    plain = [{"role": "user", "content": "hello"}]
    payload = llm_core._build_ollama_payload(
        "m",
        plain,
        temperature=0.0,
        max_tokens=0,
    )
    first_message = payload["messages"][0]
    assert first_message == {"role": "user", "content": "hello"}
|
|
|
|
|
|
def test_ollama_payload_tolerates_malformed_arguments():
    """Unparseable tool-call arguments must not make the builder raise."""
    broken_call = {"function": {"name": "x", "arguments": "{not json"}}
    conversation = [{"role": "assistant", "tool_calls": [broken_call]}]

    payload = llm_core._build_ollama_payload(
        "m",
        conversation,
        temperature=0.0,
        max_tokens=0,
    )

    # Falls back to an empty object rather than raising.
    emitted_function = payload["messages"][0]["tool_calls"][0]["function"]
    assert emitted_function["arguments"] == {}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# num_ctx threading (issue #909)
|
|
#
|
|
# Ollama defaults num_ctx to 2048 when the option is omitted, so prompts
|
|
# going to any Ollama backend are silently truncated there regardless of
|
|
# the model's actual capability. The builder must accept a discovered
|
|
# context length and emit options.num_ctx — but only when the value is
|
|
# trusted: known, positive, and not the DEFAULT_CONTEXT fallback. A real
# value smaller than 2048 is still passed through.
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_build_ollama_payload_emits_num_ctx_when_known_and_large():
    """A trusted context length above Ollama's 2048 default is emitted
    unchanged as options.num_ctx."""
    conversation = [{"role": "user", "content": "x"}]
    payload = llm_core._build_ollama_payload(
        "kimi-k2",
        conversation,
        temperature=0.5,
        max_tokens=100,
        num_ctx=131072,
    )
    options = payload["options"]
    assert options["num_ctx"] == 131072
|
|
|
|
|
|
def test_build_ollama_payload_emits_num_ctx_for_small_known_models():
    """A known context length below 2048 must still be emitted.

    A model with a real context smaller than Ollama's 2048 default would
    OOM if Ollama used its own default, so the real value is passed.
    """
    conversation = [{"role": "user", "content": "x"}]
    payload = llm_core._build_ollama_payload(
        "tiny-llm",
        conversation,
        temperature=0.5,
        max_tokens=100,
        num_ctx=1024,
    )
    options = payload["options"]
    assert options["num_ctx"] == 1024
|
|
|
|
|
|
def test_build_ollama_payload_omits_none_and_zero():
    """Neither None nor 0 may be emitted as options.num_ctx.

    None means the caller didn't look the value up; 0 is nonsensical and
    would amount to a 0-context request.
    """
    conversation = [{"role": "user", "content": "x"}]
    for ctx in (None, 0):
        payload = llm_core._build_ollama_payload(
            "m",
            conversation,
            temperature=0.5,
            max_tokens=100,
            num_ctx=ctx,
        )
        # The payload may have no "options" key at all; treat that as empty.
        options = payload.get("options", {})
        assert "num_ctx" not in options, f"num_ctx={ctx} should not be emitted"
|
|
|
|
|
|
def test_build_ollama_payload_omits_default_context_fallback():
    """The DEFAULT_CONTEXT fallback value must not be emitted as num_ctx.

    get_context_length returns DEFAULT_CONTEXT (128000) when it can't
    discover the model's actual window. Emitting that would lie to Ollama
    about unknown models, so the builder filters it out.
    """
    from src.model_context import DEFAULT_CONTEXT

    conversation = [{"role": "user", "content": "x"}]
    payload = llm_core._build_ollama_payload(
        "unknown-llm-9001",
        conversation,
        temperature=0.5,
        max_tokens=100,
        num_ctx=DEFAULT_CONTEXT,
    )
    # The payload may have no "options" key at all; treat that as empty.
    options = payload.get("options", {})
    assert "num_ctx" not in options
|
|
|
|
|
|
def test_llm_call_threads_discovered_num_ctx(monkeypatch):
    """A real, large value from get_context_length must reach the outgoing
    Ollama request as options.num_ctx (issue #909)."""
    captured = {}

    def recording_post(url, headers=None, json=None, timeout=None):
        # Only the request body matters for this test.
        captured["json"] = json
        return httpx.Response(
            200,
            request=httpx.Request("POST", url),
            json={"message": {"content": "OK"}, "done": True},
        )

    # Pretend context discovery succeeded with a 32k window.
    monkeypatch.setattr(llm_core, "get_context_length", lambda url, model: 32768)
    monkeypatch.setattr(llm_core.httpx, "post", recording_post)

    llm_core.llm_call(
        "https://ollama.com/api",
        "kimi-k2",
        [{"role": "user", "content": "Say OK"}],
        temperature=0.2,
        max_tokens=7,
    )

    sent_options = captured["json"]["options"]
    assert sent_options["num_ctx"] == 32768
|
|
|
|
|
|
def test_stream_llm_threads_discovered_num_ctx(monkeypatch):
    """stream_llm goes through the same ollama branch and must also hand
    the discovered num_ctx to the streaming request builder."""
    import asyncio

    observed = {}

    def spy_build_ollama_payload(*args, **kwargs):
        # Record the keyword arguments under test and return a minimal
        # streaming payload in place of the real builder's output.
        observed["num_ctx"] = kwargs.get("num_ctx")
        observed["stream"] = kwargs.get("stream")
        return {
            "model": "kimi-k2",
            "messages": [{"role": "user", "content": "x"}],
            "stream": True,
        }

    monkeypatch.setattr(llm_core, "get_context_length", lambda url, model: 32768)
    monkeypatch.setattr(llm_core, "_build_ollama_payload", spy_build_ollama_payload)

    # Short-circuit before the actual HTTP call: host is "dead" → yields
    # an error SSE chunk and returns. The call to _build_ollama_payload
    # still happens before the host check, so we can inspect it.
    monkeypatch.setattr(llm_core, "_is_host_dead", lambda url: True)

    async def collect():
        chunks = []
        async for chunk in llm_core.stream_llm(
            "https://ollama.com/api",
            "kimi-k2",
            [{"role": "user", "content": "Say OK"}],
            temperature=0.2,
            max_tokens=7,
        ):
            chunks.append(chunk)
        return chunks

    out = asyncio.run(collect())

    assert observed["num_ctx"] == 32768
    assert observed["stream"] is True
    assert out  # we got the SSE error chunk
|