Fix Ollama agent single-token responses (#1591)
Agent mode treated local /v1 endpoints, including Ollama on :11434, as native-tool-capable by host/model heuristics. On Ollama's OpenAI-compatible surface some models that advertise tool support stop after a single token when schemas are sent (issue #1567). Default local Ollama /v1 back to fenced tool blocks unless the endpoint explicitly has supports_tools=True. Also compare both the runtime chat URL and the normalized endpoint base when reading ModelEndpoint.supports_tools. That keeps a saved base URL such as http://localhost:11434/v1 effective when the active session URL is /v1/chat/completions. Tests: .venv/bin/python -m pytest tests/test_tool_support_heuristic.py
This commit is contained in:
@@ -13,6 +13,7 @@ import re
|
|||||||
import time
|
import time
|
||||||
import logging
|
import logging
|
||||||
from typing import AsyncGenerator, List, Dict, Optional, Set
|
from typing import AsyncGenerator, List, Dict, Optional, Set
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
from src.llm_core import stream_llm, stream_llm_with_fallback, _is_ollama_native_url
|
from src.llm_core import stream_llm, stream_llm_with_fallback, _is_ollama_native_url
|
||||||
from src.model_context import estimate_tokens
|
from src.model_context import estimate_tokens
|
||||||
@@ -475,6 +476,45 @@ _ADMIN_SCHEMA_NAMES = frozenset([
|
|||||||
])
|
])
|
||||||
_TOOL_SELECTION_TIMEOUT_SECONDS = 1.5
|
_TOOL_SELECTION_TIMEOUT_SECONDS = 1.5
|
||||||
|
|
||||||
|
|
||||||
|
def _is_ollama_openai_compat_url(endpoint_url: str) -> bool:
|
||||||
|
"""Return True for local Ollama's OpenAI-compatible /v1 surface.
|
||||||
|
|
||||||
|
Ollama's /v1 endpoint accepts the OpenAI chat shape, but model-level tool
|
||||||
|
streaming is uneven. Some local models terminate after a token when schemas
|
||||||
|
are present. Keep native schemas opt-in via ModelEndpoint.supports_tools.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
parsed = urlparse(endpoint_url or "")
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
path = (parsed.path or "").rstrip("/")
|
||||||
|
return parsed.port == 11434 and (path == "/v1" or path.startswith("/v1/"))
|
||||||
|
|
||||||
|
|
||||||
|
def _endpoint_lookup_keys(endpoint_url: str) -> List[str]:
|
||||||
|
"""Candidate ModelEndpoint.base_url keys for a runtime chat URL."""
|
||||||
|
raw = (endpoint_url or "").strip()
|
||||||
|
keys: List[str] = []
|
||||||
|
|
||||||
|
def add(value: str):
|
||||||
|
value = (value or "").strip()
|
||||||
|
if value and value not in keys:
|
||||||
|
keys.append(value)
|
||||||
|
trimmed = value.rstrip("/")
|
||||||
|
if trimmed and trimmed not in keys:
|
||||||
|
keys.append(trimmed)
|
||||||
|
if trimmed and f"{trimmed}/" not in keys:
|
||||||
|
keys.append(f"{trimmed}/")
|
||||||
|
|
||||||
|
add(raw)
|
||||||
|
try:
|
||||||
|
from src.endpoint_resolver import normalize_base
|
||||||
|
add(normalize_base(raw))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return keys
|
||||||
|
|
||||||
# Admin tool keywords — if the last user message contains any of these, include admin tools
|
# Admin tool keywords — if the last user message contains any of these, include admin tools
|
||||||
_ADMIN_KEYWORDS = [
|
_ADMIN_KEYWORDS = [
|
||||||
"session", "sessions", "chat", "chats", "conversation", "conversations",
|
"session", "sessions", "chat", "chats", "conversation", "conversations",
|
||||||
@@ -1456,18 +1496,18 @@ async def stream_agent_loop(
|
|||||||
_model_lc = (model or "").lower()
|
_model_lc = (model or "").lower()
|
||||||
# Step 1: per-endpoint override (set at registration time from the
|
# Step 1: per-endpoint override (set at registration time from the
|
||||||
# serve command — `--enable-auto-tool-choice` flips it on. UI can
|
# serve command — `--enable-auto-tool-choice` flips it on. UI can
|
||||||
# also toggle per endpoint). NULL = unknown, fall through to the
|
# also toggle per endpoint). NULL = unknown; for local Ollama /v1 we
|
||||||
# keyword heuristic + host check.
|
# default to fenced tools, otherwise fall through to keyword + host checks.
|
||||||
_endpoint_supports: Optional[bool] = None
|
_endpoint_supports: Optional[bool] = None
|
||||||
try:
|
try:
|
||||||
from core.database import SessionLocal as _SL, ModelEndpoint as _ME
|
from core.database import SessionLocal as _SL, ModelEndpoint as _ME
|
||||||
_db = _SL()
|
_db = _SL()
|
||||||
try:
|
try:
|
||||||
_ep = _db.query(_ME).filter(_ME.base_url == endpoint_url).first()
|
_ep = None
|
||||||
if not _ep and endpoint_url:
|
for _key in _endpoint_lookup_keys(endpoint_url):
|
||||||
_u = endpoint_url.rstrip("/")
|
_ep = _db.query(_ME).filter(_ME.base_url == _key).first()
|
||||||
_ep = _db.query(_ME).filter(_ME.base_url == _u).first() or \
|
if _ep is not None:
|
||||||
_db.query(_ME).filter(_ME.base_url == _u + "/").first()
|
break
|
||||||
if _ep is not None:
|
if _ep is not None:
|
||||||
_endpoint_supports = _ep.supports_tools
|
_endpoint_supports = _ep.supports_tools
|
||||||
finally:
|
finally:
|
||||||
@@ -1503,9 +1543,15 @@ async def stream_agent_loop(
|
|||||||
# (via the endpoint settings toggle), treat Ollama-native as text-only so
|
# (via the endpoint settings toggle), treat Ollama-native as text-only so
|
||||||
# the fenced-block path is used instead of native function calling.
|
# the fenced-block path is used instead of native function calling.
|
||||||
_is_ollama_native = _is_ollama_native_url(endpoint_url or "")
|
_is_ollama_native = _is_ollama_native_url(endpoint_url or "")
|
||||||
|
_ollama_openai_compat = _is_ollama_openai_compat_url(endpoint_url or "")
|
||||||
if _endpoint_supports is True:
|
if _endpoint_supports is True:
|
||||||
_is_api_model = True
|
_is_api_model = True
|
||||||
elif _endpoint_supports is False or _model_no_tools or _is_ollama_native:
|
elif (
|
||||||
|
_endpoint_supports is False
|
||||||
|
or _model_no_tools
|
||||||
|
or _is_ollama_native
|
||||||
|
or _ollama_openai_compat
|
||||||
|
):
|
||||||
_is_api_model = False
|
_is_api_model = False
|
||||||
else:
|
else:
|
||||||
_is_api_model = any(h in endpoint_url for h in _API_HOSTS) or _model_supports_tools
|
_is_api_model = any(h in endpoint_url for h in _API_HOSTS) or _model_supports_tools
|
||||||
|
|||||||
@@ -1,13 +1,14 @@
|
|||||||
"""Regression tests for the tool-support heuristic in stream_agent_loop.
|
"""Regression tests for the tool-support heuristic in stream_agent_loop.
|
||||||
|
|
||||||
Verifies two critical cases:
|
Verifies two critical cases:
|
||||||
1. deepseek-r1 on a local Ollama endpoint must NOT enable native tool schemas
|
1. local Ollama endpoints must NOT enable native tool schemas by default
|
||||||
(Ollama returns HTTP 400 for these models when tools are sent).
|
(some models terminate after one token with schemas).
|
||||||
2. api.deepseek.com must still be treated as tool-capable via the host
|
2. api.deepseek.com must still be treated as tool-capable via the host
|
||||||
allow-list (_API_HOSTS), so cloud deepseek users keep working.
|
allow-list (_API_HOSTS), so cloud deepseek users keep working.
|
||||||
"""
|
"""
|
||||||
import pytest
|
import pytest
|
||||||
from src.agent_loop import _API_HOSTS
|
from src.agent_loop import _API_HOSTS, _endpoint_lookup_keys, _is_ollama_openai_compat_url
|
||||||
|
from src.llm_core import _is_ollama_native_url
|
||||||
|
|
||||||
|
|
||||||
def _compute_is_api_model(model: str, endpoint_url: str, endpoint_supports=None) -> bool:
|
def _compute_is_api_model(model: str, endpoint_url: str, endpoint_supports=None) -> bool:
|
||||||
@@ -28,13 +29,18 @@ def _compute_is_api_model(model: str, endpoint_url: str, endpoint_supports=None)
|
|||||||
|
|
||||||
if endpoint_supports is True:
|
if endpoint_supports is True:
|
||||||
return True
|
return True
|
||||||
if endpoint_supports is False or model_no_tools:
|
if (
|
||||||
|
endpoint_supports is False
|
||||||
|
or model_no_tools
|
||||||
|
or _is_ollama_native_url(endpoint_url)
|
||||||
|
or _is_ollama_openai_compat_url(endpoint_url)
|
||||||
|
):
|
||||||
return False
|
return False
|
||||||
return any(h in endpoint_url for h in _API_HOSTS) or model_supports_tools
|
return any(h in endpoint_url for h in _API_HOSTS) or model_supports_tools
|
||||||
|
|
||||||
|
|
||||||
class TestDeepSeekToolSupport:
|
class TestDeepSeekToolSupport:
|
||||||
# --- local Ollama cases (must NOT get tool schemas) ---
|
# --- local Ollama cases (must NOT get native tool schemas by default) ---
|
||||||
|
|
||||||
def test_deepseek_r1_7b_local_ollama_no_tools(self):
|
def test_deepseek_r1_7b_local_ollama_no_tools(self):
|
||||||
result = _compute_is_api_model(
|
result = _compute_is_api_model(
|
||||||
@@ -56,6 +62,21 @@ class TestDeepSeekToolSupport:
|
|||||||
"deepseek-r1:7b", "http://host.docker.internal:11434/v1"
|
"deepseek-r1:7b", "http://host.docker.internal:11434/v1"
|
||||||
) is False
|
) is False
|
||||||
|
|
||||||
|
def test_qwen_local_ollama_defaults_to_fenced_tools(self):
|
||||||
|
assert _compute_is_api_model(
|
||||||
|
"qwen3.5:4b", "http://localhost:11434/v1"
|
||||||
|
) is False
|
||||||
|
|
||||||
|
def test_gemma_local_ollama_defaults_to_fenced_tools(self):
|
||||||
|
assert _compute_is_api_model(
|
||||||
|
"gemma4:e4b", "http://host.docker.internal:11434/v1"
|
||||||
|
) is False
|
||||||
|
|
||||||
|
def test_qwen_native_ollama_defaults_to_fenced_tools(self):
|
||||||
|
assert _compute_is_api_model(
|
||||||
|
"qwen3.5:4b", "http://localhost:11434/api/chat"
|
||||||
|
) is False
|
||||||
|
|
||||||
# --- cloud API cases (must still get tool schemas) ---
|
# --- cloud API cases (must still get tool schemas) ---
|
||||||
|
|
||||||
def test_deepseek_cloud_api_gets_tools(self):
|
def test_deepseek_cloud_api_gets_tools(self):
|
||||||
@@ -82,6 +103,20 @@ class TestDeepSeekToolSupport:
|
|||||||
)
|
)
|
||||||
assert result is True
|
assert result is True
|
||||||
|
|
||||||
|
def test_endpoint_supports_true_overrides_ollama_default(self):
|
||||||
|
"""A user can still explicitly opt a known-good Ollama endpoint into
|
||||||
|
native schemas."""
|
||||||
|
result = _compute_is_api_model(
|
||||||
|
"qwen3.5:4b", "http://localhost:11434/v1", endpoint_supports=True
|
||||||
|
)
|
||||||
|
assert result is True
|
||||||
|
|
||||||
|
def test_endpoint_supports_true_overrides_native_ollama_default(self):
|
||||||
|
result = _compute_is_api_model(
|
||||||
|
"qwen3.5:4b", "http://localhost:11434/api/chat", endpoint_supports=True
|
||||||
|
)
|
||||||
|
assert result is True
|
||||||
|
|
||||||
def test_endpoint_supports_false_overrides_cloud(self):
|
def test_endpoint_supports_false_overrides_cloud(self):
|
||||||
"""supports_tools=False on an endpoint gates even cloud APIs."""
|
"""supports_tools=False on an endpoint gates even cloud APIs."""
|
||||||
result = _compute_is_api_model(
|
result = _compute_is_api_model(
|
||||||
@@ -91,11 +126,11 @@ class TestDeepSeekToolSupport:
|
|||||||
|
|
||||||
# --- other local models unaffected ---
|
# --- other local models unaffected ---
|
||||||
|
|
||||||
def test_qwen_local_still_gets_tools(self):
|
def test_qwen_local_non_ollama_still_gets_tools(self):
|
||||||
assert _compute_is_api_model("qwen2.5:14b", "http://localhost:11434/v1") is True
|
assert _compute_is_api_model("qwen2.5:14b", "http://localhost:8000/v1") is True
|
||||||
|
|
||||||
def test_llama_local_gets_tools_via_host(self):
|
def test_llama_local_non_ollama_gets_tools_via_host(self):
|
||||||
assert _compute_is_api_model("llama3.2:3b", "http://localhost:11434/v1") is True
|
assert _compute_is_api_model("llama3.2:3b", "http://localhost:8000/v1") is True
|
||||||
|
|
||||||
|
|
||||||
class TestApiHostsContainsDeepSeek:
|
class TestApiHostsContainsDeepSeek:
|
||||||
@@ -104,3 +139,16 @@ class TestApiHostsContainsDeepSeek:
|
|||||||
|
|
||||||
def test_deepseek_com_in_api_hosts(self):
|
def test_deepseek_com_in_api_hosts(self):
|
||||||
assert "deepseek.com" in _API_HOSTS
|
assert "deepseek.com" in _API_HOSTS
|
||||||
|
|
||||||
|
|
||||||
|
class TestEndpointLookupKeys:
    """Checks that runtime chat URLs expand to the saved endpoint base URL."""

    def test_chat_completions_url_matches_endpoint_base(self):
        # The OpenAI-style chat URL must yield the /v1 base, with and without
        # a trailing slash.
        candidates = _endpoint_lookup_keys("http://localhost:11434/v1/chat/completions")

        for expected in ("http://localhost:11434/v1", "http://localhost:11434/v1/"):
            assert expected in candidates

    def test_native_ollama_chat_url_matches_api_base(self):
        # The native /api/chat URL must yield the /api base.
        candidates = _endpoint_lookup_keys("http://host.docker.internal:11434/api/chat")

        assert "http://host.docker.internal:11434/api" in candidates
|
||||||
|
|||||||
Reference in New Issue
Block a user