From 3da4edb442e4cae3808e2e2f239d6f5c04c06fb4 Mon Sep 17 00:00:00 2001
From: Afonso Coutinho <afonso@omelhorsite.pt>
Date: Wed, 3 Jun 2026 05:36:57 +0100
Subject: [PATCH] fix: token usage dropped when it rides on a non-empty finish
 delta (#1703)

---
 src/llm_core.py                           |  14 ++-
 tests/test_llm_core_usage_finish_delta.py | 103 ++++++++++++++++++++++
 2 files changed, 116 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_llm_core_usage_finish_delta.py

diff --git a/src/llm_core.py b/src/llm_core.py
index b255e5e..b1b4acc 100644
--- a/src/llm_core.py
+++ b/src/llm_core.py
@@ -1323,7 +1323,19 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
                                 # Usage chunk (from stream_options)
                                 _choices = j.get("choices") or []
                                 _delta0 = _choices[0].get("delta") if _choices else None
-                                if "usage" in j and _delta0 in (None, {}, {"content": None}):
+                                # Capture usage whenever the chunk carries it and
+                                # the delta has no actual output. Some gateways /
+                                # local servers attach usage to the FINAL delta,
+                                # which also carries role/finish_reason (so it is
+                                # not exactly None/{}/{"content": None}); gating on
+                                # those exact shapes discarded their token counts.
+                                _delta_has_output = isinstance(_delta0, dict) and (
+                                    _delta0.get("content")
+                                    or _delta0.get("reasoning_content")
+                                    or _delta0.get("reasoning")
+                                    or _delta0.get("tool_calls")
+                                )
+                                if "usage" in j and not _delta_has_output:
                                     u = j["usage"]
                                     _usage_data = {"input_tokens": u.get("prompt_tokens", 0), "output_tokens": u.get("completion_tokens", 0)}
                                     # llama.cpp puts a `timings` block alongside `usage` with the
diff --git a/tests/test_llm_core_usage_finish_delta.py b/tests/test_llm_core_usage_finish_delta.py
new file mode 100644
index 0000000..9f28f9f
--- /dev/null
+++ b/tests/test_llm_core_usage_finish_delta.py
@@ -0,0 +1,103 @@
+"""Token usage must be captured even when it rides on a non-empty finish delta.
+
+Some OpenAI-compatible gateways and local servers send usage on the FINAL
+streamed chunk, whose delta also carries role / finish_reason (e.g.
+{"delta": {"role": "assistant", "content": null}, "finish_reason": "stop"}).
+stream_llm only captured usage when the delta was exactly None / {} /
+{"content": None}, so those providers' token accounting read zero.
+"""
+import asyncio
+import json
+
+from src import llm_core
+
+
+class _FakeResp:  # stand-in for the response object yielded by _FakeStreamCtx
+    def __init__(self, lines):  # lines: the canned lines aiter_lines() replays
+        self._lines = lines
+        self.status_code = 200  # fixed status for every fake response
+
+    async def aiter_lines(self):  # async generator: yields the canned lines in order
+        for ln in self._lines:
+            yield ln
+
+    async def aread(self):  # stubbed body read: always empty bytes
+        return b""
+
+
+class _FakeStreamCtx:  # async context manager returned by _FakeClient.stream
+    def __init__(self, lines):  # lines: forwarded unchanged to _FakeResp
+        self._lines = lines
+
+    async def __aenter__(self):  # entering hands back a _FakeResp over the lines
+        return _FakeResp(self._lines)
+
+    async def __aexit__(self, *a):  # exception details are accepted but unused
+        return False  # falsy return: exceptions raised in the block propagate
+
+
+class _FakeClient:  # object _drive makes llm_core._get_http_client return
+    def __init__(self, lines):  # lines: canned lines every stream() call serves
+        self._lines = lines
+
+    def stream(self, method, url, **kw):  # method, url and kwargs are ignored
+        return _FakeStreamCtx(self._lines)
+
+
+def _drive(monkeypatch, lines, model="gpt-4o-test"):  # run stream_llm over canned lines
+    monkeypatch.setattr(llm_core, "_get_http_client", lambda: _FakeClient(lines))  # serve canned lines
+    monkeypatch.setattr(llm_core, "_is_host_dead", lambda u: False)  # stub: always False
+    monkeypatch.setattr(llm_core, "note_model_activity", lambda *a, **k: None)  # stub: no-op
+    monkeypatch.setattr(llm_core, "_clear_host_dead", lambda *a, **k: None)  # stub: no-op
+    monkeypatch.setattr(llm_core, "_mark_host_dead", lambda *a, **k: False, raising=False)  # ok if attribute is absent
+
+    async def run():  # collect every chunk stream_llm yields, then join them
+        out = []
+        async for chunk in llm_core.stream_llm(
+            "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions",
+            model, [{"role": "user", "content": "hi"}],
+            headers={"Authorization": "Bearer k"},
+        ):
+            out.append(chunk)
+        return "".join(out)  # single string; _usage_events() splits it on newlines
+
+    return asyncio.run(run())  # run the coroutine to completion and return its result
+
+
+def _usage_events(blob):  # return the "data" payload of each type == "usage" event
+    events = []
+    for ln in blob.split("\n"):
+        ln = ln.strip()
+        if ln.startswith("data: ") and ln[6:] != "[DONE]":  # 6 == len("data: ")
+            try:
+                j = json.loads(ln[6:])
+            except ValueError:  # json.JSONDecodeError subclasses ValueError; skip bad JSON
+                continue
+            if j.get("type") == "usage":  # every other event type is ignored
+                events.append(j["data"])
+    return events
+
+
+def test_usage_on_finish_delta_with_role_is_captured(monkeypatch):  # usage rides on the finish chunk
+    lines = [
+        'data: ' + json.dumps({"choices": [{"delta": {"content": "Hello"}}]}),  # ordinary content chunk
+        'data: ' + json.dumps({  # final chunk: role + null content + finish_reason + usage
+            "choices": [{"delta": {"role": "assistant", "content": None}, "finish_reason": "stop"}],
+            "usage": {"prompt_tokens": 9, "completion_tokens": 1},
+        }),
+        'data: [DONE]',
+    ]
+    usage = _usage_events(_drive(monkeypatch, lines))
+    assert usage, "usage on a non-empty finish delta was dropped"
+    assert usage[-1] == {"input_tokens": 9, "output_tokens": 1}  # prompt/completion counts under new keys
+
+
+def test_usage_on_empty_choices_chunk_still_captured(monkeypatch):  # guards the pre-existing path
+    # Canonical OpenAI include_usage shape: the final chunk has empty choices plus usage.
+    lines = [
+        'data: ' + json.dumps({"choices": [{"delta": {"content": "Hi"}}]}),  # ordinary content chunk
+        'data: ' + json.dumps({"choices": [], "usage": {"prompt_tokens": 4, "completion_tokens": 2}}),
+        'data: [DONE]',
+    ]
+    usage = _usage_events(_drive(monkeypatch, lines))
+    assert usage and usage[-1] == {"input_tokens": 4, "output_tokens": 2}  # last usage event wins