From c3228f8b5960f1fb0f9ba7e7d88fd9cfcd0c6baa Mon Sep 17 00:00:00 2001
From: Tushar-Projects <204472109+Tushar-Projects@users.noreply.github.com>
Date: Tue, 2 Jun 2026 17:27:42 +0530
Subject: [PATCH] Background tasks: respect active session model fallback

---
 routes/chat_helpers.py              |  3 +++
 services/memory/memory_extractor.py | 27 +++++++++++++++++++++++++--
 services/memory/skill_extractor.py  | 20 +++++++++++++++++++-
 src/endpoint_resolver.py            | 11 ++++++++---
 4 files changed, 55 insertions(+), 6 deletions(-)
diff --git a/routes/chat_helpers.py b/routes/chat_helpers.py
index 7e7a764..06d886d 100644
--- a/routes/chat_helpers.py
+++ b/routes/chat_helpers.py
@@ -149,6 +149,9 @@ async def auto_name_session(session_manager, sess):
         t_url, t_model, t_headers = resolve_task_endpoint(
             sess.endpoint_url, sess.model, sess.headers,
         )
+        if not t_model:
+            logger.debug("[auto-name] No model provided, skipping")
+            return
 
         # max_tokens big enough that reasoning models (Minimax M2,
         # DeepSeek R1, QwQ, etc.) have headroom for <think>…</think>
diff --git a/services/memory/memory_extractor.py b/services/memory/memory_extractor.py
index 0f82ba8..c994bb9 100644
--- a/services/memory/memory_extractor.py
+++ b/services/memory/memory_extractor.py
@@ -235,6 +235,10 @@ async def extract_and_store(
     Designed to run as a background task (asyncio.create_task).
     Errors are logged, never raised.
     """
+    if not endpoint_url or not model:
+        logger.debug("[memory-extract] No model or URL provided, skipping")
+        return
+
     try:
         from src.llm_core import llm_call_async
 
@@ -245,11 +249,30 @@ async def extract_and_store(
         if len(recent) < 2:
             return  # Need at least a user message and assistant response
 
-        fallback_facts = _fallback_memory_candidates(recent)
+        # Strip non-text blocks (images/audio) — background memory extraction only
+        # needs text, and the VL-generated descriptions are already in that text.
+        # Messages containing only non-text blocks are dropped. This avoids sending
+        # image tokens to non-vision models and accidental "vision grounding" triggers.
+        stripped_recent = []
+        for msg in recent:
+            role = msg.get("role")
+            content = msg.get("content", "")
+            if isinstance(content, list):
+                # Filter out multimodal blocks that aren't text
+                text_only = [b for b in content if isinstance(b, dict) and b.get("type") == "text"]
+                if not text_only and content:
+                    continue
+                content = text_only
+            stripped_recent.append({"role": role, "content": content})
+
+        if not stripped_recent:
+            return
+
+        fallback_facts = _fallback_memory_candidates(stripped_recent)
 
         extraction_messages = [
             {"role": "system", "content": EXTRACT_SYSTEM_PROMPT},
-        ] + recent
+        ] + stripped_recent
 
         facts = []
         try:
diff --git a/services/memory/skill_extractor.py b/services/memory/skill_extractor.py
index e0f3e3d..4e267a3 100644
--- a/services/memory/skill_extractor.py
+++ b/services/memory/skill_extractor.py
@@ -59,6 +59,10 @@ async def maybe_extract_skill(
     owner: Optional[str] = None,
 ):
     """Extract a skill if the agent run was complex enough."""
+    if not model:
+        logger.debug("[skill-extract] No model provided, skipping")
+        return None
+
     # Quiet by default; flip to DEBUG when chasing extractor issues.
     logger.debug(
         "[skill-extract] start: rounds=%d tools=%d model=%s owner=%s",
@@ -78,9 +82,23 @@ async def maybe_extract_skill(
             logger.debug("[skill-extract] no recent messages, skipping")
             return None
 
+        # Strip media (images/audio); drop messages containing only non-text blocks
+        stripped_recent = []
+        for msg in recent:
+            content = msg.get("content", "")
+            if isinstance(content, list):
+                text_only = [b for b in content if isinstance(b, dict) and b.get("type") == "text"]
+                if not text_only and content:
+                    continue
+                content = text_only
+            stripped_recent.append({"role": msg.get("role"), "content": content})
+
+        if not stripped_recent:
+            return None
+
         # Build conversation summary for extraction
         conv_lines = []
-        for msg in recent:
+        for msg in stripped_recent:
             role = msg.get("role", "?")
             content = msg.get("content", "")
             if isinstance(content, list):
diff --git a/src/endpoint_resolver.py b/src/endpoint_resolver.py
index f0cd163..55f1b43 100644
--- a/src/endpoint_resolver.py
+++ b/src/endpoint_resolver.py
@@ -234,9 +234,14 @@ def resolve_endpoint(
     ep_id = _stg(f"{setting_prefix}_endpoint_id")
     model = _stg(f"{setting_prefix}_model")
 
-    # Unset Utility means "same as Default Chat Model". This keeps background
-    # features usable out of the box and lets users override Utility only when
-    # they explicitly want a separate cheaper/faster model.
+    # If no endpoint is configured for this prefix but the caller passed both a
+    # fallback URL and model (e.g. the active session's), return that fallback
+    # now. This keeps background tasks from jumping to the global default_model
+    # when the user is mid-conversation with a different model.
+    if not ep_id and fallback_url and fallback_model:
+        return fallback_url, fallback_model, fallback_headers
+
+    # With no usable fallback, unset Utility means "same as Default Chat Model".
     if setting_prefix == "utility" and not ep_id:
         ep_id = _stg("default_endpoint_id")
         model = _stg("default_model")