From 3da4edb442e4cae3808e2e2f239d6f5c04c06fb4 Mon Sep 17 00:00:00 2001 From: Afonso Coutinho Date: Wed, 3 Jun 2026 05:36:57 +0100 Subject: [PATCH] fix: token usage dropped when it rides on a non-empty finish delta (#1703) --- src/llm_core.py | 14 ++- tests/test_llm_core_usage_finish_delta.py | 103 ++++++++++++++++++++++ 2 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 tests/test_llm_core_usage_finish_delta.py diff --git a/src/llm_core.py b/src/llm_core.py index b255e5e..b1b4acc 100644 --- a/src/llm_core.py +++ b/src/llm_core.py @@ -1323,7 +1323,19 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl # Usage chunk (from stream_options) _choices = j.get("choices") or [] _delta0 = _choices[0].get("delta") if _choices else None - if "usage" in j and _delta0 in (None, {}, {"content": None}): + # Capture usage whenever the chunk carries it and + # the delta has no actual output. Some gateways / + # local servers attach usage to the FINAL delta, + # which also carries role/finish_reason (so it is + # not exactly None/{}/{"content": None}); gating on + # those exact shapes discarded their token counts. + _delta_has_output = isinstance(_delta0, dict) and ( + _delta0.get("content") + or _delta0.get("reasoning_content") + or _delta0.get("reasoning") + or _delta0.get("tool_calls") + ) + if "usage" in j and not _delta_has_output: u = j["usage"] _usage_data = {"input_tokens": u.get("prompt_tokens", 0), "output_tokens": u.get("completion_tokens", 0)} # llama.cpp puts a `timings` block alongside `usage` with the diff --git a/tests/test_llm_core_usage_finish_delta.py b/tests/test_llm_core_usage_finish_delta.py new file mode 100644 index 0000000..9f28f9f --- /dev/null +++ b/tests/test_llm_core_usage_finish_delta.py @@ -0,0 +1,103 @@ +"""Token usage must be captured even when it rides on a non-empty finish delta. 
+
+Some OpenAI-compatible gateways and local servers send usage on the FINAL
+streamed chunk, whose delta also carries role / finish_reason (e.g.
+{"delta": {"role": "assistant", "content": null}, "finish_reason": "stop"}).
+stream_llm only captured usage when the delta was exactly None / {} /
+{"content": None}, so those providers\' token accounting read zero.
+"""
+import asyncio
+import json
+
+from src import llm_core
+
+
+class _FakeResp:  # canned response object: replays the given lines via aiter_lines()
+    def __init__(self, lines):
+        self._lines = lines
+        self.status_code = 200  # every fake response reports HTTP 200
+
+    async def aiter_lines(self):  # async generator yielding the canned lines in order
+        for ln in self._lines:
+            yield ln
+
+    async def aread(self):  # reading the body always gives empty bytes
+        return b""
+
+
+class _FakeStreamCtx:  # async context manager whose __aenter__ returns a _FakeResp
+    def __init__(self, lines):
+        self._lines = lines
+
+    async def __aenter__(self):
+        return _FakeResp(self._lines)
+
+    async def __aexit__(self, *a):
+        return False  # falsy return: exceptions from the async-with body propagate
+
+
+class _FakeClient:  # stand-in client exposing only stream()
+    def __init__(self, lines):
+        self._lines = lines
+
+    def stream(self, method, url, **kw):  # method, url and **kw are accepted but ignored
+        return _FakeStreamCtx(self._lines)
+
+
+def _drive(monkeypatch, lines, model="gpt-4o-test"):  # feed `lines` to stream_llm, return its joined output
+    monkeypatch.setattr(llm_core, "_get_http_client", lambda: _FakeClient(lines))  # swap in the fake client factory
+    monkeypatch.setattr(llm_core, "_is_host_dead", lambda u: False)
+    monkeypatch.setattr(llm_core, "note_model_activity", lambda *a, **k: None)
+    monkeypatch.setattr(llm_core, "_clear_host_dead", lambda *a, **k: None)
+    monkeypatch.setattr(llm_core, "_mark_host_dead", lambda *a, **k: False, raising=False)  # raising=False: no error if the attribute is absent
+
+    async def run():
+        out = []
+        async for chunk in llm_core.stream_llm(
+            "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions",
+            model, [{"role": "user", "content": "hi"}],
+            headers={"Authorization": "Bearer k"},
+        ):
+            out.append(chunk)
+        return "".join(out)  # everything stream_llm yielded, concatenated into one string
+
+    return asyncio.run(run())  # run the async driver to completion on a new event loop
+
+
+def _usage_events(blob):  # return the "data" field of each parsed `data: ` line whose "type" is "usage"
+    events = []
+    for ln in blob.split("\n"):
+        ln = ln.strip()
+        if ln.startswith("data: ") and ln[6:] != "[DONE]":  # 6 == len("data: "); skip the [DONE] sentinel
+            try:
+                j = json.loads(ln[6:])
+            except ValueError:  # json.JSONDecodeError is a ValueError; skip non-JSON payloads
+                continue
+            if j.get("type") == "usage":
+                events.append(j["data"])
+    return events
+
+
+def test_usage_on_finish_delta_with_role_is_captured(monkeypatch):  # usage rides on a delta with role + null content
+    lines = [
+        'data: ' + json.dumps({"choices": [{"delta": {"content": "Hello"}}]}),
+        'data: ' + json.dumps({
+            "choices": [{"delta": {"role": "assistant", "content": None}, "finish_reason": "stop"}],
+            "usage": {"prompt_tokens": 9, "completion_tokens": 1},
+        }),
+        'data: [DONE]',
+    ]
+    usage = _usage_events(_drive(monkeypatch, lines))
+    assert usage, "usage on a non-empty finish delta was dropped"
+    assert usage[-1] == {"input_tokens": 9, "output_tokens": 1}  # prompt/completion counts mapped to input/output
+
+
+def test_usage_on_empty_choices_chunk_still_captured(monkeypatch):
+    # canonical OpenAI include_usage: final chunk has empty choices + usage
+    lines = [
+        'data: ' + json.dumps({"choices": [{"delta": {"content": "Hi"}}]}),
+        'data: ' + json.dumps({"choices": [], "usage": {"prompt_tokens": 4, "completion_tokens": 2}}),
+        'data: [DONE]',
+    ]
+    usage = _usage_events(_drive(monkeypatch, lines))
+    assert usage and usage[-1] == {"input_tokens": 4, "output_tokens": 2}