fix: recognize Gemma 4 as a thinking model and add context entry (#1642)

Gemma 4 returns reasoning_content in streaming responses via llama-server, but the model wasn't listed in _THINKING_MODEL_PATTERNS, causing reasoning tokens to be mishandled. Add "gemma" to the pattern list and register Gemma 4's 256K (262,144-token) context window in KNOWN_CONTEXT_WINDOWS so the agent loop budgets context correctly. Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-06-03 15:23:18 +10:00
parent b45611e9c5
commit 39848a168b
2 changed files with 2 additions and 1 deletions
--- a/src/llm_core.py
+++ b/src/llm_core.py
@@ -420,7 +420,7 @@ def _restricts_temperature(model: str) -> bool:
    return any(m.startswith(p) or f"/{p}" in m for p in _FIXED_TEMPERATURE_MODELS)

 # Models that support structured thinking — may output </think> without opening tag
-_THINKING_MODEL_PATTERNS = ("qwen3", "qwq", "deepseek-r1", "deepseek-reasoner", "minimax", "m2-reap")
+_THINKING_MODEL_PATTERNS = ("qwen3", "qwq", "deepseek-r1", "deepseek-reasoner", "minimax", "m2-reap", "gemma")

 def _supports_thinking(model: str) -> bool:
    """Check if model supports structured thinking output."""
--- a/src/model_context.py
+++ b/src/model_context.py
@@ -83,6 +83,7 @@ KNOWN_CONTEXT_WINDOWS = {
    'gemini-2.0-flash': 1048576,
    'gemini-1.5-pro': 1048576,
    'gemini-1.5-flash': 1048576,
+    'gemma-4': 262144,
    'gemma-3': 128000,
    'gemma-2': 8192,