From cb6f6b65ea574cad538771f89592c0ad39e83c2e Mon Sep 17 00:00:00 2001
From: Ernest Hysa <takis312@hotmail.com>
Date: Mon, 1 Jun 2026 23:25:38 +0100
Subject: [PATCH] fix(research): validate session_id to block path traversal
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Every research endpoint interpolates session_id into filesystem paths
(Path('data/deep_research') / f'{session_id}.json') without checking
for traversal sequences. A crafted ID like '../../data/auth' reaches
arbitrary JSON files — readable via research_detail (which also leaks
file paths in error messages), writable via research_archive, and
deletable via research_delete.

Add _validate_session_id() which rejects anything outside
[a-zA-Z0-9-]{1,128}. Called before filesystem access in all 12
endpoints that accept a session_id path parameter.
---
 routes/research_routes.py                    | 19 +++++++
 tests/test_research_session_id_validation.py | 55 ++++++++++++++++++++
 2 files changed, 74 insertions(+)
 create mode 100644 tests/test_research_session_id_validation.py

diff --git a/routes/research_routes.py b/routes/research_routes.py
index 4def1dd..a04f762 100644
--- a/routes/research_routes.py
+++ b/routes/research_routes.py
@@ -3,6 +3,7 @@
 import asyncio
 import json
 import logging
+import re
 import uuid
 from datetime import datetime
 from pathlib import Path
@@ -14,6 +15,8 @@ from pydantic import BaseModel, Field
 from src.endpoint_resolver import resolve_endpoint
 from src.auth_helpers import get_current_user
 
+_SESSION_ID_RE = re.compile(r"^[a-zA-Z0-9-]{1,128}$")
+
 logger = logging.getLogger(__name__)
 
 # Model-name substrings that are NOT chat/generation models — research must
@@ -58,6 +61,10 @@ def setup_research_routes(research_handler, session_manager=None) -> APIRouter:
             raise HTTPException(401, "Not authenticated")
         return user
 
+    def _validate_session_id(session_id: str) -> None:
+        if not _SESSION_ID_RE.fullmatch(session_id):
+            raise HTTPException(400, "Invalid session ID format")
+
     def _owns_in_memory(session_id: str, user: str) -> bool:
         """Ownership check for an in-flight (in-memory) research task.
         Falls back to the on-disk JSON if the task has already finished."""
@@ -95,6 +102,7 @@ def setup_research_routes(research_handler, session_manager=None) -> APIRouter:
     @router.get("/api/research/status/{session_id}")
     async def research_status(session_id: str, request: Request):
         user = _require_user(request)
+        _validate_session_id(session_id)
         if not _owns_in_memory(session_id, user):
             raise HTTPException(404, "No research found for this session")
         status = research_handler.get_status(session_id)
@@ -105,6 +113,7 @@ def setup_research_routes(research_handler, session_manager=None) -> APIRouter:
     @router.post("/api/research/cancel/{session_id}")
     async def research_cancel(session_id: str, request: Request):
         user = _require_user(request)
+        _validate_session_id(session_id)
         if not _owns_in_memory(session_id, user):
             raise HTTPException(404, "No research found for this session")
         cancelled = research_handler.cancel_research(session_id)
@@ -113,6 +122,7 @@ def setup_research_routes(research_handler, session_manager=None) -> APIRouter:
     @router.post("/api/research/result/{session_id}")
     async def research_result(session_id: str, request: Request):
         user = _require_user(request)
+        _validate_session_id(session_id)
         if not _owns_in_memory(session_id, user):
             raise HTTPException(404, "No research result available")
         result = research_handler.get_result(session_id)
@@ -140,6 +150,7 @@ def setup_research_routes(research_handler, session_manager=None) -> APIRouter:
     async def research_report(session_id: str, request: Request):
         """Serve the visual HTML report for a completed research session."""
         user = _require_user(request)
+        _validate_session_id(session_id)
         _assert_owns_research(session_id, user)
         logger.info(f"Visual report requested for session {session_id}")
         try:
@@ -160,6 +171,7 @@ def setup_research_routes(research_handler, session_manager=None) -> APIRouter:
         """Mark an image URL as hidden for this research's visual report.
         Persisted to the research JSON so subsequent /report renders skip it."""
         user = _require_user(request)
+        _validate_session_id(session_id)
         _assert_owns_research(session_id, user)
         ok = research_handler.hide_image(session_id, body.url)
         if not ok:
@@ -170,6 +182,7 @@ def setup_research_routes(research_handler, session_manager=None) -> APIRouter:
     async def research_unhide_images(session_id: str, request: Request):
         """Clear the hidden-images list for a research session."""
         user = _require_user(request)
+        _validate_session_id(session_id)
         _assert_owns_research(session_id, user)
         ok = research_handler.unhide_all_images(session_id)
         if not ok:
@@ -235,6 +248,7 @@ def setup_research_routes(research_handler, session_manager=None) -> APIRouter:
         """Return the full JSON for a single research result — sources,
         summary, stats — used by the Library preview panel."""
         user = _require_user(request)
+        _validate_session_id(session_id)
         path = Path("data/deep_research") / f"{session_id}.json"
         if not path.exists():
             raise HTTPException(404, "Research not found")
@@ -251,6 +265,7 @@ def setup_research_routes(research_handler, session_manager=None) -> APIRouter:
     async def research_archive(session_id: str, request: Request, archived: bool = Query(True)):
         """Soft-archive / restore a research report (sets `archived` in its JSON)."""
         user = _require_user(request)
+        _validate_session_id(session_id)
         path = Path("data/deep_research") / f"{session_id}.json"
         if not path.exists():
             raise HTTPException(404, "Research not found")
@@ -270,6 +285,7 @@ def setup_research_routes(research_handler, session_manager=None) -> APIRouter:
     async def research_delete(session_id: str, request: Request):
         """Delete a research result from disk."""
         user = _require_user(request)
+        _validate_session_id(session_id)
         data_dir = Path("data/deep_research")
         json_path = data_dir / f"{session_id}.json"
         deleted = False
@@ -413,6 +429,7 @@ def setup_research_routes(research_handler, session_manager=None) -> APIRouter:
     async def research_stream(session_id: str, request: Request):
         """SSE stream of research progress events."""
         user = _require_user(request)
+        _validate_session_id(session_id)
         if not _owns_in_memory(session_id, user):
             raise HTTPException(404, "No research found for this session")
         async def _generate():
@@ -446,6 +463,7 @@ def setup_research_routes(research_handler, session_manager=None) -> APIRouter:
     async def research_result_peek(session_id: str, request: Request):
         """Get research result without clearing it (for panel use)."""
         user = _require_user(request)
+        _validate_session_id(session_id)
         if not _owns_in_memory(session_id, user):
             raise HTTPException(404, "No research found for this session")
         result = research_handler.get_result(session_id)
@@ -475,6 +493,7 @@ def setup_research_routes(research_handler, session_manager=None) -> APIRouter:
         the user can ask follow-up questions in a clean conversation.
         """
         _require_user(request)
+        _validate_session_id(session_id)
         if session_manager is None:
             raise HTTPException(500, "session_manager not configured")
 
diff --git a/tests/test_research_session_id_validation.py b/tests/test_research_session_id_validation.py
new file mode 100644
index 0000000..499b72a
--- /dev/null
+++ b/tests/test_research_session_id_validation.py
@@ -0,0 +1,55 @@
+"""Regression tests: research session_id must reject path-traversal sequences."""
+
+import re
+import unittest
+
+_SESSION_ID_RE = re.compile(r"^[a-zA-Z0-9-]{1,128}$")
+
+
+class TestResearchSessionIdValidation(unittest.TestCase):
+    """Validate the regex used to guard research session_id path params."""
+
+    def test_accepts_rp_prefixed_id(self):
+        self.assertIsNotNone(_SESSION_ID_RE.fullmatch("rp-abc123def456"))
+
+    def test_accepts_standard_uuid(self):
+        self.assertIsNotNone(
+            _SESSION_ID_RE.fullmatch("550e8400-e29b-41d4-a716-446655440000")
+        )
+
+    def test_accepts_custom_alphanumeric(self):
+        self.assertIsNotNone(_SESSION_ID_RE.fullmatch("custom-id-123"))
+
+    def test_rejects_double_dot(self):
+        self.assertIsNone(_SESSION_ID_RE.fullmatch(".."))
+
+    def test_rejects_single_dot(self):
+        self.assertIsNone(_SESSION_ID_RE.fullmatch("."))
+
+    def test_rejects_dot_slash_traversal(self):
+        self.assertIsNone(_SESSION_ID_RE.fullmatch("../../data/auth"))
+
+    def test_rejects_deep_traversal(self):
+        self.assertIsNone(_SESSION_ID_RE.fullmatch("../../../etc/passwd"))
+
+    def test_rejects_mixed_traversal(self):
+        self.assertIsNone(_SESSION_ID_RE.fullmatch("normal/../../traversal"))
+
+    def test_rejects_dot_prefix_traversal(self):
+        self.assertIsNone(_SESSION_ID_RE.fullmatch("./../../secret"))
+
+    def test_rejects_empty(self):
+        self.assertIsNone(_SESSION_ID_RE.fullmatch(""))
+
+    def test_rejects_whitespace(self):
+        self.assertIsNone(_SESSION_ID_RE.fullmatch(" "))
+
+    def test_rejects_slash(self):
+        self.assertIsNone(_SESSION_ID_RE.fullmatch("a/b"))
+
+    def test_rejects_null_byte(self):
+        self.assertIsNone(_SESSION_ID_RE.fullmatch("rp-test\x00"))
+
+
+if __name__ == "__main__":
+    unittest.main()