Fix Ollama agent single-token responses (#1591)

Agent mode treated local /v1 endpoints, including Ollama on :11434, as native-tool-capable by host/model heuristics. On Ollama's OpenAI-compatible surface some models that advertise tool support stop after a single token when schemas are sent (issue #1567). Default local Ollama /v1 back to fenced tool blocks unless the endpoint explicitly has supports_tools=True.

Also compare both the runtime chat URL and the normalized endpoint base when reading ModelEndpoint.supports_tools. That keeps a saved base URL such as http://localhost:11434/v1 effective when the active session URL is /v1/chat/completions.

Tests: .venv/bin/python -m pytest tests/test_tool_support_heuristic.py
This commit is contained in:
Marius Popa
2026-06-04 13:45:10 +03:00
committed by GitHub
parent 965185c6f9
commit dc365a1b27
2 changed files with 111 additions and 17 deletions

View File

@@ -13,6 +13,7 @@ import re
import time
import logging
from typing import AsyncGenerator, List, Dict, Optional, Set
from urllib.parse import urlparse
from src.llm_core import stream_llm, stream_llm_with_fallback, _is_ollama_native_url
from src.model_context import estimate_tokens
@@ -475,6 +476,45 @@ _ADMIN_SCHEMA_NAMES = frozenset([
])
_TOOL_SELECTION_TIMEOUT_SECONDS = 1.5
def _is_ollama_openai_compat_url(endpoint_url: str) -> bool:
"""Return True for local Ollama's OpenAI-compatible /v1 surface.
Ollama's /v1 endpoint accepts the OpenAI chat shape, but model-level tool
streaming is uneven. Some local models terminate after a token when schemas
are present. Keep native schemas opt-in via ModelEndpoint.supports_tools.
"""
try:
parsed = urlparse(endpoint_url or "")
except Exception:
return False
path = (parsed.path or "").rstrip("/")
return parsed.port == 11434 and (path == "/v1" or path.startswith("/v1/"))
def _endpoint_lookup_keys(endpoint_url: str) -> List[str]:
"""Candidate ModelEndpoint.base_url keys for a runtime chat URL."""
raw = (endpoint_url or "").strip()
keys: List[str] = []
def add(value: str):
value = (value or "").strip()
if value and value not in keys:
keys.append(value)
trimmed = value.rstrip("/")
if trimmed and trimmed not in keys:
keys.append(trimmed)
if trimmed and f"{trimmed}/" not in keys:
keys.append(f"{trimmed}/")
add(raw)
try:
from src.endpoint_resolver import normalize_base
add(normalize_base(raw))
except Exception:
pass
return keys
# Admin tool keywords — if the last user message contains any of these, include admin tools
_ADMIN_KEYWORDS = [
"session", "sessions", "chat", "chats", "conversation", "conversations",
@@ -1456,18 +1496,18 @@ async def stream_agent_loop(
_model_lc = (model or "").lower()
# Step 1: per-endpoint override (set at registration time from the
# serve command — `--enable-auto-tool-choice` flips it on. UI can
# also toggle per endpoint). NULL = unknown, fall through to the
# keyword heuristic + host check.
# also toggle per endpoint). NULL = unknown; for local Ollama /v1 we
# default to fenced tools, otherwise fall through to keyword + host checks.
_endpoint_supports: Optional[bool] = None
try:
from core.database import SessionLocal as _SL, ModelEndpoint as _ME
_db = _SL()
try:
_ep = _db.query(_ME).filter(_ME.base_url == endpoint_url).first()
if not _ep and endpoint_url:
_u = endpoint_url.rstrip("/")
_ep = _db.query(_ME).filter(_ME.base_url == _u).first() or \
_db.query(_ME).filter(_ME.base_url == _u + "/").first()
_ep = None
for _key in _endpoint_lookup_keys(endpoint_url):
_ep = _db.query(_ME).filter(_ME.base_url == _key).first()
if _ep is not None:
break
if _ep is not None:
_endpoint_supports = _ep.supports_tools
finally:
@@ -1503,9 +1543,15 @@ async def stream_agent_loop(
# (via the endpoint settings toggle), treat Ollama-native as text-only so
# the fenced-block path is used instead of native function calling.
_is_ollama_native = _is_ollama_native_url(endpoint_url or "")
_ollama_openai_compat = _is_ollama_openai_compat_url(endpoint_url or "")
if _endpoint_supports is True:
_is_api_model = True
elif _endpoint_supports is False or _model_no_tools or _is_ollama_native:
elif (
_endpoint_supports is False
or _model_no_tools
or _is_ollama_native
or _ollama_openai_compat
):
_is_api_model = False
else:
_is_api_model = any(h in endpoint_url for h in _API_HOSTS) or _model_supports_tools

View File

@@ -1,13 +1,14 @@
"""Regression tests for the tool-support heuristic in stream_agent_loop.
Verifies two critical cases:
1. deepseek-r1 on a local Ollama endpoint must NOT enable native tool schemas
(Ollama returns HTTP 400 for these models when tools are sent).
1. local Ollama endpoints must NOT enable native tool schemas by default
(some models terminate after one token with schemas).
2. api.deepseek.com must still be treated as tool-capable via the host
allow-list (_API_HOSTS), so cloud deepseek users keep working.
"""
import pytest
from src.agent_loop import _API_HOSTS
from src.agent_loop import _API_HOSTS, _endpoint_lookup_keys, _is_ollama_openai_compat_url
from src.llm_core import _is_ollama_native_url
def _compute_is_api_model(model: str, endpoint_url: str, endpoint_supports=None) -> bool:
@@ -28,13 +29,18 @@ def _compute_is_api_model(model: str, endpoint_url: str, endpoint_supports=None)
if endpoint_supports is True:
return True
if endpoint_supports is False or model_no_tools:
if (
endpoint_supports is False
or model_no_tools
or _is_ollama_native_url(endpoint_url)
or _is_ollama_openai_compat_url(endpoint_url)
):
return False
return any(h in endpoint_url for h in _API_HOSTS) or model_supports_tools
class TestDeepSeekToolSupport:
# --- local Ollama cases (must NOT get tool schemas) ---
# --- local Ollama cases (must NOT get native tool schemas by default) ---
def test_deepseek_r1_7b_local_ollama_no_tools(self):
result = _compute_is_api_model(
@@ -56,6 +62,21 @@ class TestDeepSeekToolSupport:
"deepseek-r1:7b", "http://host.docker.internal:11434/v1"
) is False
def test_qwen_local_ollama_defaults_to_fenced_tools(self):
assert _compute_is_api_model(
"qwen3.5:4b", "http://localhost:11434/v1"
) is False
def test_gemma_local_ollama_defaults_to_fenced_tools(self):
assert _compute_is_api_model(
"gemma4:e4b", "http://host.docker.internal:11434/v1"
) is False
def test_qwen_native_ollama_defaults_to_fenced_tools(self):
assert _compute_is_api_model(
"qwen3.5:4b", "http://localhost:11434/api/chat"
) is False
# --- cloud API cases (must still get tool schemas) ---
def test_deepseek_cloud_api_gets_tools(self):
@@ -82,6 +103,20 @@ class TestDeepSeekToolSupport:
)
assert result is True
def test_endpoint_supports_true_overrides_ollama_default(self):
"""A user can still explicitly opt a known-good Ollama endpoint into
native schemas."""
result = _compute_is_api_model(
"qwen3.5:4b", "http://localhost:11434/v1", endpoint_supports=True
)
assert result is True
def test_endpoint_supports_true_overrides_native_ollama_default(self):
result = _compute_is_api_model(
"qwen3.5:4b", "http://localhost:11434/api/chat", endpoint_supports=True
)
assert result is True
def test_endpoint_supports_false_overrides_cloud(self):
"""supports_tools=False on an endpoint gates even cloud APIs."""
result = _compute_is_api_model(
@@ -91,11 +126,11 @@ class TestDeepSeekToolSupport:
# --- other local models unaffected ---
def test_qwen_local_still_gets_tools(self):
assert _compute_is_api_model("qwen2.5:14b", "http://localhost:11434/v1") is True
def test_qwen_local_non_ollama_still_gets_tools(self):
assert _compute_is_api_model("qwen2.5:14b", "http://localhost:8000/v1") is True
def test_llama_local_gets_tools_via_host(self):
assert _compute_is_api_model("llama3.2:3b", "http://localhost:11434/v1") is True
def test_llama_local_non_ollama_gets_tools_via_host(self):
assert _compute_is_api_model("llama3.2:3b", "http://localhost:8000/v1") is True
class TestApiHostsContainsDeepSeek:
@@ -104,3 +139,16 @@ class TestApiHostsContainsDeepSeek:
def test_deepseek_com_in_api_hosts(self):
assert "deepseek.com" in _API_HOSTS
class TestEndpointLookupKeys:
    """Runtime chat URLs must expand to the base URLs an endpoint is saved under."""

    def test_chat_completions_url_matches_endpoint_base(self):
        # The saved /v1 base, with or without a trailing slash, has to be
        # among the candidates produced for the full chat-completions URL.
        candidates = _endpoint_lookup_keys(
            "http://localhost:11434/v1/chat/completions"
        )
        for expected in (
            "http://localhost:11434/v1",
            "http://localhost:11434/v1/",
        ):
            assert expected in candidates

    def test_native_ollama_chat_url_matches_api_base(self):
        # The native chat URL has to yield the /api base as a candidate.
        candidates = _endpoint_lookup_keys(
            "http://host.docker.internal:11434/api/chat"
        )
        assert "http://host.docker.internal:11434/api" in candidates