diff --git a/src/tool_index.py b/src/tool_index.py index 5e27e1a..506e55d 100644 --- a/src/tool_index.py +++ b/src/tool_index.py @@ -293,7 +293,11 @@ class ToolIndex: # Keyword hints: if the query mentions these words, force-include the tools. _KEYWORD_HINTS = { - frozenset({"email", "mail", "gmail", "googlemail", "message", "send", "reply", "inbox", "unread", "tell"}): + # NOTE: "tell" was removed from this set. It fired on any "tell me ..." + # request (e.g. "visit and tell me the title"), force-including the + # whole email toolset and crowding out the relevant tools — the model then + # believed it had only email tools and refused web/other tasks (#1707). + frozenset({"email", "mail", "gmail", "googlemail", "message", "send", "reply", "inbox", "unread"}): {"list_email_accounts", "list_emails", "read_email", "send_email", "reply_to_email", "bulk_email", "delete_email", "archive_email", "mark_email_read", "resolve_contact", "ui_control"}, frozenset({"calendar", "event", "meeting", "schedule", "appointment"}): {"manage_calendar"}, diff --git a/tests/test_tool_rag_keyword_hints.py b/tests/test_tool_rag_keyword_hints.py new file mode 100644 index 0000000..5a6f978 --- /dev/null +++ b/tests/test_tool_rag_keyword_hints.py @@ -0,0 +1,57 @@ +"""Regression for issue #1707 — the agent tool-RAG force-included the entire +email toolset on any "tell me ..." query, crowding out the relevant tools so the +model believed it only had email tools and refused web/other tasks. + +Root cause: `_KEYWORD_HINTS` in src/tool_index.py listed "tell" under the email +intent, and `get_tools_for_query` force-includes a hint's tools whenever any of +its keywords appears (word-boundary match). "tell" appears in a huge fraction of +requests (the reporter's was "visit and tell me the title"), so email tools +were force-included for non-email queries. + +These hints are deterministic string matching — no embeddings — so we can test +`get_tools_for_query` directly with retrieval stubbed out (no ChromaDB needed). +""" + +from src.tool_index import ToolIndex, ALWAYS_AVAILABLE + +_EMAIL_TOOLS = { + "list_emails", "read_email", "send_email", "reply_to_email", + "bulk_email", "delete_email", "archive_email", "mark_email_read", +} + + +def _index_without_embeddings(): + """A ToolIndex whose retrieval returns nothing, so get_tools_for_query + exercises only the deterministic base + keyword-hint logic.""" + ti = ToolIndex.__new__(ToolIndex) # skip __init__ (no ChromaDB/fastembed) + ti.retrieve = lambda query, k=8: [] + return ti + + +def test_tell_in_web_query_does_not_force_email_tools(): + """The #1707 repro: a web request that merely contains the word 'tell' must + NOT drag in the email toolset.""" + ti = _index_without_embeddings() + q = "visit https://www.youtube.com/user/PewDiePie and tell me the title of his latest video" + tools = ti.get_tools_for_query(q) + leaked = _EMAIL_TOOLS & tools + assert not leaked, f"'tell me' must not force-include email tools, got {sorted(leaked)}" + # web_search / web_fetch are always-available and must remain present. + assert "web_search" in tools and "web_fetch" in tools + + +def test_genuine_email_query_still_gets_email_tools(): + """Removing 'tell' must not break real email intent — the actual email + keywords still force-include the toolset.""" + ti = _index_without_embeddings() + tools = ti.get_tools_for_query("reply to the unread email in my inbox") + assert {"reply_to_email", "send_email", "read_email"} <= tools + + +def test_plain_tell_request_stays_minimal(): + """A bare 'tell me a joke' must not pull in email tools either.""" + ti = _index_without_embeddings() + tools = ti.get_tools_for_query("tell me a joke") + assert not (_EMAIL_TOOLS & tools) + # Always-available baseline is still there. + assert set(ALWAYS_AVAILABLE) <= tools