From 5b1e56407b355517d321456a91851b6906fd1ffe Mon Sep 17 00:00:00 2001 From: Rifqi Akram <35358522+rifqiakrm@users.noreply.github.com> Date: Mon, 1 Jun 2026 14:57:28 +0700 Subject: [PATCH] Add SSRF-guarded web fetch agent tool * feat(web-fetch): add web_fetch tool to read a specific URL's content * test(web-fetch): add SSRF coverage and fail closed on empty DNS resolution Add explicit SSRF regression tests for the web_fetch path covering loopback, private LAN ranges, link-local/metadata, IPv6 private/local, redirect-into-private, and unsupported schemes. Harden _public_http_url to fail closed when a hostname resolves to no addresses. --- routes/chat_routes.py | 3 +- src/agent_loop.py | 6 +++ src/agent_tools.py | 2 +- src/search/content.py | 34 ++++++++++--- src/settings.py | 1 + src/task_scheduler.py | 2 +- src/tool_execution.py | 55 +++++++++++++++++++++ src/tool_index.py | 3 +- src/tool_parsing.py | 6 +++ src/tool_schemas.py | 14 ++++++ tests/test_security_regressions.py | 76 ++++++++++++++++++++++++++++++ 11 files changed, 192 insertions(+), 10 deletions(-) diff --git a/routes/chat_routes.py b/routes/chat_routes.py index e984bcb..3cdcb85 100644 --- a/routes/chat_routes.py +++ b/routes/chat_routes.py @@ -389,6 +389,7 @@ def setup_chat_routes( disabled_tools.add("bash") if str(allow_web_search).lower() != "true": disabled_tools.add("web_search") + disabled_tools.add("web_fetch") # Nobody/incognito mode: deny tools that would expose the user's # persistent memory, past chats, or other identity-linked data. 
@@ -452,7 +453,7 @@ def setup_chat_routes( disabled_tools.update(_compare_strip) # In chat mode compare, disable ALL agent tools (no bash, python, file ops) if chat_mode == 'chat': - disabled_tools.update({"bash", "python", "read_file", "write_file", "web_search", "search_chats", "manage_tasks"}) + disabled_tools.update({"bash", "python", "read_file", "write_file", "web_search", "web_fetch", "search_chats", "manage_tasks"}) async def stream_with_save() -> AsyncGenerator[str, None]: # _effective_mode is read-only here; closure captures it from diff --git a/src/agent_loop.py b/src/agent_loop.py index 000aefc..40aa1b1 100644 --- a/src/agent_loop.py +++ b/src/agent_loop.py @@ -199,6 +199,12 @@ Or with JSON for fresh news: ``` Search the web for a SINGLE quick fact/lookup mid-task. For news / "today" / "latest" queries, pass `time_filter` ("day", "week", "month", or "year"). NOT for "research X" / "do research on X" / "look into X" requests — those mean a multi-source DEEP RESEARCH job: use `trigger_research` instead (it runs in the Deep Research sidebar and produces a full report). web_search = one quick query; trigger_research = a researched report.""", + "web_fetch": """\ +```web_fetch + +``` +Fetch and read the text content of a SPECIFIC URL the user names (e.g. "check example.com", "what does this page say "). A bare domain like `example.com` works (defaults to https). Use this when you already have a concrete URL. 
For open-ended lookups use `web_search`, and for "research X" jobs use `trigger_research`.""", + "read_file": """\ ```read_file diff --git a/src/agent_tools.py b/src/agent_tools.py index 2277407..9a54ab8 100644 --- a/src/agent_tools.py +++ b/src/agent_tools.py @@ -26,7 +26,7 @@ MAX_OUTPUT_CHARS = 10_000 MAX_READ_CHARS = 20_000 # Tool types that trigger execution -TOOL_TAGS = {"bash", "python", "web_search", "read_file", "write_file", +TOOL_TAGS = {"bash", "python", "web_search", "web_fetch", "read_file", "write_file", "create_document", "update_document", "edit_document", "search_chats", "chat_with_model", "create_session", "list_sessions", diff --git a/src/search/content.py b/src/search/content.py index 2420154..1c469e8 100644 --- a/src/search/content.py +++ b/src/search/content.py @@ -1,5 +1,6 @@ """Webpage content fetching with caching, PDF extraction, and summarization helpers.""" +import copy import io import ipaddress import json @@ -61,9 +62,12 @@ def _public_http_url(url: str) -> bool: except ValueError: pass try: - return all(not _is_private_address(ip) for ip in _resolve_hostname_ips(host)) + ips = _resolve_hostname_ips(host) except OSError: return False + # Fail closed: a hostname that resolves to nothing is treated as + # non-public (an empty all(...) would otherwise return True). + return bool(ips) and all(not _is_private_address(ip) for ip in ips) def _get_public_url(url: str, *, headers: dict, timeout: int) -> httpx.Response: @@ -297,7 +301,8 @@ def fetch_webpage_content(url: str, timeout: int = 5, retry_attempt: int = 0) -> js_rendered = _detect_js_frameworks(soup) js_message = "Page appears to be rendered by a JavaScript framework; content may be incomplete." if js_rendered else "" - # Main textual content (heuristic) + # Main textual content (heuristic): prefer semantic / "content"-classed + # containers to skip nav/footer/boilerplate; tuned for article pages. 
main_content = "" content_areas = soup.find_all( ["main", "article", "section", "div"], @@ -306,12 +311,29 @@ def fetch_webpage_content(url: str, timeout: int = 5, retry_attempt: int = 0) -> if content_areas: for area in content_areas[:3]: main_content += area.get_text(separator=" ", strip=True) + " " - if not main_content: + main_content = re.sub(r"\s+", " ", main_content).strip() + + # The class heuristic can latch onto a small wrapper and miss the real + # content (app/landing pages, or SSR sites whose body isn't in a + # "content"-classed div, so these came back nearly empty before). When the + # heuristic returns nothing OR suspiciously little, fall back to the full + # body element, stripping scripts/styles (so JSON/JS doesn't leak into the text) + # plus nav/header/footer/aside (boilerplate), and keep whichever yields + # more readable text. + THIN_CONTENT_CHARS = 600 # below this the heuristic likely missed the page + if len(main_content) < THIN_CONTENT_CHARS: body = soup.find("body") if body: - main_content = body.get_text(separator=" ", strip=True) - - main_content = re.sub(r"\s+", " ", main_content).strip() + # Strip from a copy so the later list/table/code extractors still + # see the original soup unmodified. 
+ body_copy = copy.copy(body) + for _noise in body_copy.find_all( + ["script", "style", "noscript", "template", "nav", "header", "footer", "aside"] + ): + _noise.extract() + body_text = re.sub(r"\s+", " ", body_copy.get_text(separator=" ", strip=True)).strip() + if len(body_text) > len(main_content): + main_content = body_text result = { "url": url, diff --git a/src/settings.py b/src/settings.py index 7da1e73..76af61a 100644 --- a/src/settings.py +++ b/src/settings.py @@ -122,6 +122,7 @@ DEFAULT_SETTINGS = { DEFAULT_FEATURES = { "web_search": True, + "web_fetch": True, "deep_research": False, "memory": True, "document_editor": True, diff --git a/src/task_scheduler.py b/src/task_scheduler.py index 4268b96..bb1341a 100644 --- a/src/task_scheduler.py +++ b/src/task_scheduler.py @@ -2059,7 +2059,7 @@ class TaskScheduler: "manage_calendar", "manage_notes", "manage_tasks", "manage_memory", "list_email_accounts", "list_emails", "read_email", "send_email", "reply_to_email", "archive_email", "mark_email_read", "delete_email", "resolve_contact", - "search_chats", "web_search", "read_file", + "search_chats", "web_search", "web_fetch", "read_file", "create_document", "update_document", "edit_document", "generate_image", "trigger_research", "download_model", "serve_model", "list_served_models", "stop_served_model", diff --git a/src/tool_execution.py b/src/tool_execution.py index 21ab553..e0a04d2 100644 --- a/src/tool_execution.py +++ b/src/tool_execution.py @@ -195,6 +195,7 @@ _MCP_TOOL_MAP = { "read_file": ("filesystem", "read_file"), "write_file": ("filesystem", "write_file"), "web_search": ("web_search", "web_search"), + "web_fetch": ("web_fetch", "web_fetch"), "generate_image": ("image_gen", "generate_image"), } @@ -238,6 +239,7 @@ _MCP_ARG_PARSERS: Dict[str, callable] = { "bash": lambda c: {"command": c}, "python": lambda c: {"code": c}, "web_search": lambda c: {"query": c.split("\n")[0].strip()}, + "web_fetch": lambda c: {"url": c.split("\n")[0].strip()}, "read_file": 
lambda c: {"path": c.split("\n")[0].strip()}, "write_file": _parse_write_file, "generate_image": _parse_generate_image, @@ -464,6 +466,59 @@ async def _direct_fallback( output += "\n\n" return {"output": output, "exit_code": 0} + if tool == "web_fetch": + # Lightweight single-URL fetch. Wraps the SSRF-safe fetcher used + # by deep research, so private/loopback/metadata addresses are + # already blocked there. + from src.search.content import fetch_webpage_content + raw = content.strip() + url = "" + # Accept either a JSON arg ({"url": "..."}) or a plain URL/domain. + if raw.startswith("{"): + try: + parsed = _json.loads(raw) + if isinstance(parsed, dict): + url = str(parsed.get("url") or "").strip() + except _json.JSONDecodeError: + url = "" + if not url: + # Non-JSON (or JSON without a usable url): take the first line + # only, so a URL followed by commentary still parses. + url = raw.split("\n")[0].strip() + # Reject anything that isn't a single bare URL/domain token. + if not url or url.startswith("{") or any(c in url for c in (" ", "\t", "\n")): + return {"error": "web_fetch: provide a single URL or domain, e.g. example.com", "exit_code": 1} + low = url.lower() + if "://" in low and not low.startswith(("http://", "https://")): + return {"error": f"web_fetch: unsupported URL scheme (only http/https): {url[:80]}", "exit_code": 1} + # Accept bare domains like "example.com" by defaulting to https. 
+ if not low.startswith(("http://", "https://")): + url = "https://" + url + loop = asyncio.get_running_loop() + try: + result = await asyncio.wait_for( + loop.run_in_executor(None, lambda: fetch_webpage_content(url, timeout=10)), + timeout=30, + ) + except asyncio.TimeoutError: + return {"error": f"web_fetch: timed out fetching {url}", "exit_code": 1} + err = result.get("error") + text = (result.get("content") or "").strip() + title = result.get("title") or "" + + if not text: + if err: + return {"error": f"web_fetch: {url}: {err}", "exit_code": 1} + # No extractable text: non-HTML body, or a pure client-rendered + # shell. The agent can fall back to the builtin_browser tool. + return {"error": f"web_fetch: {url}: no readable text content (not HTML, or the page needs JS/login)", "exit_code": 1} + + header = (f"# {title}\n" if title else "") + f"Source: {url}\n\n" + output = header + text + if len(output) > MAX_OUTPUT_CHARS: + output = output[:MAX_OUTPUT_CHARS] + "\n\n[...truncated]" + return {"output": output, "exit_code": 0} + # manage_memory / generate_image still live as MCP servers # (mcp_servers/{memory,image_gen}_server.py); the MCP path above # handles them. diff --git a/src/tool_index.py b/src/tool_index.py index 32a9ca1..f8e8fae 100644 --- a/src/tool_index.py +++ b/src/tool_index.py @@ -22,7 +22,7 @@ logger = logging.getLogger(__name__) # Tools that are ALWAYS included regardless of retrieval results. # These are the most commonly needed and should never be missing. ALWAYS_AVAILABLE = frozenset({ - "bash", "python", "web_search", "read_file", + "bash", "python", "web_search", "web_fetch", "read_file", "api_call", # For configured integrations (Miniflux, Gitea, Linkding, etc.) # The two genuinely AMBIENT cookbook tools — "what's running" and # "kill it" can be asked any time without prior cookbook context, @@ -62,6 +62,7 @@ BUILTIN_TOOL_DESCRIPTIONS: Dict[str, str] = { "bash": "Run shell commands on the server. 
Install packages, check files, git operations, curl, system info, process management, networking.", "python": "Execute Python code for computation, data processing, math, scripting, parsing, API calls. Not for writing code for the user.", "web_search": "Quick single web lookup for a fact, current event, or doc mid-task. NOT for 'research X' / 'do research on X' requests — those are deep-research jobs (use trigger_research). web_search = one query; trigger_research = a full researched report in the sidebar.", + "web_fetch": "Fetch and read the text content of a specific URL/website the user names (e.g. 'check example.com', 'open this link'). Use when you have a concrete URL; for open-ended lookups use web_search instead.", "read_file": "Read a file from disk and return its contents. View source code, config files, logs.", "write_file": "Write content to a file on disk. Create new files, save output, update configs.", "create_document": "Create a new document in the editor panel. For code, articles, text content longer than 15 lines. 
Specify title, language, and content.", diff --git a/src/tool_parsing.py b/src/tool_parsing.py index 6b39786..6d7aae3 100644 --- a/src/tool_parsing.py +++ b/src/tool_parsing.py @@ -95,6 +95,10 @@ _TOOL_NAME_MAP = { "search": "web_search", "web_search": "web_search", "websearch": "web_search", + "web_fetch": "web_fetch", + "webfetch": "web_fetch", + "fetch_url": "web_fetch", + "fetch": "web_fetch", "read": "read_file", "read_file": "read_file", "cat": "read_file", @@ -305,6 +309,8 @@ def _parse_tool_code_block(raw: str) -> Optional[ToolBlock]: content = xml_params.get("code", args_body) elif mapped == "web_search": content = xml_params.get("query", args_body) + elif mapped == "web_fetch": + content = xml_params.get("url", args_body) elif mapped in ("read_file", "write_file"): content = xml_params.get("path", xml_params.get("file_path", args_body)) else: diff --git a/src/tool_schemas.py b/src/tool_schemas.py index 619ce4f..f0a69e0 100644 --- a/src/tool_schemas.py +++ b/src/tool_schemas.py @@ -64,6 +64,20 @@ FUNCTION_TOOL_SCHEMAS = [ } } }, + { + "type": "function", + "function": { + "name": "web_fetch", + "description": "Fetch and read the text content of a specific URL the user names (e.g. 'check example.com', 'what's on this page '). Use when you already have a concrete URL/domain. 
NOT for open-ended searches (use web_search) or 'research X' jobs (use trigger_research).", + "parameters": { + "type": "object", + "properties": { + "url": {"type": "string", "description": "The URL or domain to fetch (http/https; a bare domain like example.com is fine)"} + }, + "required": ["url"] + } + } + }, { "type": "function", "function": { diff --git a/tests/test_security_regressions.py b/tests/test_security_regressions.py index be3f8ae..59e6f68 100644 --- a/tests/test_security_regressions.py +++ b/tests/test_security_regressions.py @@ -546,3 +546,79 @@ def test_mcp_config_listing_is_admin_gated(): assert "def list_servers(request: Request):" in src assert "def list_tools(request: Request):" in src assert "def list_server_tools(server_id: str, request: Request):" in src + + +# ── web_fetch SSRF guard (PR #111 merge gate) ─────────────────────── +# web_fetch routes every request through src.search.content's +# _public_http_url / _get_public_url, the same SSRF-safe fetcher used by +# web_search and deep research. These pin that the guard blocks every +# private/internal address class plus redirect-into-private and non-http +# schemes, so the new tool can't be turned into an SSRF primitive. 
+ +import ipaddress as _ipaddr + +import pytest as _pytest + + +@_pytest.mark.parametrize("url", [ + "http://127.0.0.1/", # IPv4 loopback + "http://localhost/", # loopback by name + "http://10.0.0.5/", # private LAN 10/8 + "http://172.16.0.1/", # private LAN 172.16/12 + "http://192.168.1.1/", # private LAN 192.168/16 + "http://169.254.169.254/latest/", # link-local / cloud metadata + "http://metadata.google.internal/", # metadata by name + "http://[::1]/", # IPv6 loopback + "http://[fc00::1]/", # IPv6 unique-local (ULA) + "http://[fe80::1]/", # IPv6 link-local + "file:///etc/passwd", # unsupported scheme + "ftp://example.com/", # unsupported scheme +]) +def test_web_fetch_guard_blocks_private_and_bad_schemes(url): + from src.search.content import _public_http_url + assert _public_http_url(url) is False + + +def test_web_fetch_guard_allows_public_ip(): + from src.search.content import _public_http_url + assert _public_http_url("http://93.184.216.34/") is True + + +def test_web_fetch_guard_blocks_dns_resolving_to_private(monkeypatch): + from src.search import content + monkeypatch.setattr(content, "_resolve_hostname_ips", + lambda host: [_ipaddr.ip_address("10.0.0.5")]) + assert content._public_http_url("https://innocent.example/") is False + + +def test_web_fetch_guard_fails_closed_on_empty_resolution(monkeypatch): + # A hostname that resolves to nothing must be treated as non-public. + from src.search import content + monkeypatch.setattr(content, "_resolve_hostname_ips", lambda host: []) + assert content._public_http_url("https://innocent.example/") is False + + +def test_web_fetch_guard_blocks_redirect_into_private(monkeypatch): + # A public URL that 302-redirects to an internal address must be blocked + # at the redirect hop, not followed. 
+ import httpx + from src.search import content + + monkeypatch.setattr(content, "_resolve_hostname_ips", + lambda host: [_ipaddr.ip_address("93.184.216.34")]) + + class _Resp: + status_code = 302 + headers = {"location": "http://169.254.169.254/latest/meta-data/"} + + class _FakeClient: + def __init__(self, *a, **k): pass + def __enter__(self): return self + def __exit__(self, *a): return False + def get(self, url): return _Resp() + + monkeypatch.setattr(httpx, "Client", _FakeClient) + + with _pytest.raises(httpx.RequestError) as exc: + content._get_public_url("http://public.example/start", headers={}, timeout=5) + assert "non-public" in str(exc.value)