diff --git a/src/llm_core.py b/src/llm_core.py index be31ac5..a929edc 100644 --- a/src/llm_core.py +++ b/src/llm_core.py @@ -1398,7 +1398,7 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl j = json.loads(data) # Usage chunk (from stream_options) _choices = j.get("choices") or [] - _delta0 = _choices[0].get("delta") if _choices else None + _delta0 = _choices[0].get("delta") if (_choices and _choices[0] is not None) else None # Capture usage whenever the chunk carries it and # the delta has no actual output. Some gateways / # local servers attach usage to the FINAL delta, @@ -1412,7 +1412,7 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl or _delta0.get("tool_calls") ) if "usage" in j and not _delta_has_output: - u = j["usage"] + u = j["usage"] or {} _usage_data = {"input_tokens": u.get("prompt_tokens", 0), "output_tokens": u.get("completion_tokens", 0)} # llama.cpp puts a `timings` block alongside `usage` with the # TRUE generation speed (predicted_per_second) — pure decode, @@ -1427,7 +1427,10 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl _usage_data["prefill_tps"] = round(_tm["prompt_per_second"], 2) yield f'data: {json.dumps({"type": "usage", "data": _usage_data})}\n\n' elif "choices" in j: - delta = j["choices"][0].get("delta") or {} + _c0 = (j["choices"] or [None])[0] + if _c0 is None: + continue + delta = _c0.get("delta") or {} if isinstance(delta, dict): # Text content # Reasoning tokens (VLLM --reasoning-parser, e.g. Qwen3/DeepSeek-R1, Nemotron). vLLM 0.20.2 / NIM emit the field as `reasoning`; older builds use `reasoning_content`. Accept either. 
@@ -1446,6 +1449,8 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl yield f'data: {json.dumps({"delta": content})}\n\n' # Native tool calls — accumulate across chunks for tc in delta.get("tool_calls") or []: + if tc is None: + continue func = tc.get("function") or {} raw_idx = tc.get("index") if raw_idx is None: diff --git a/tests/test_llm_core_usage_finish_delta.py b/tests/test_llm_core_usage_finish_delta.py index 9f28f9f..507939d 100644 --- a/tests/test_llm_core_usage_finish_delta.py +++ b/tests/test_llm_core_usage_finish_delta.py @@ -101,3 +101,56 @@ def test_usage_on_empty_choices_chunk_still_captured(monkeypatch): ] usage = _usage_events(_drive(monkeypatch, lines)) assert usage and usage[-1] == {"input_tokens": 4, "output_tokens": 2} + + +def test_null_choice_chunk_does_not_crash(monkeypatch): + # Some providers emit {"choices": [null]} as a heartbeat/keepalive chunk. + # The parser must silently skip it rather than crashing on None.get("delta"). + lines = [ + 'data: ' + json.dumps({"choices": [{"delta": {"content": "Hello"}}]}), + 'data: ' + json.dumps({"choices": [None]}), + 'data: [DONE]', + ] + result = _drive(monkeypatch, lines) + assert "Hello" in result + + +def test_null_choice_with_null_usage_does_not_crash(monkeypatch): + # Chunk with both choices:[null] and usage:null — neither field should raise. + lines = [ + 'data: ' + json.dumps({"choices": [{"delta": {"content": "Hi"}}]}), + 'data: ' + json.dumps({"choices": [None], "usage": None}), + 'data: [DONE]', + ] + result = _drive(monkeypatch, lines) + assert "Hi" in result + + +def test_null_tool_call_in_delta_is_skipped(monkeypatch): + # Some providers include null entries in the tool_calls array alongside + # valid calls. The null entry must be skipped; the valid call must survive. 
+ lines = [ + 'data: ' + json.dumps({ + "choices": [{ + "delta": { + "tool_calls": [ + None, + {"index": 0, "function": {"name": "get_weather", "arguments": '{"city":'}}, + ] + } + }] + }), + 'data: ' + json.dumps({ + "choices": [{ + "delta": { + "tool_calls": [ + {"index": 0, "function": {"name": "", "arguments": '"London"}'}}, + ] + } + }] + }), + 'data: [DONE]', + ] + result = _drive(monkeypatch, lines) + # Smoke check: the stream completes without raising. NOTE(review): `result is not None` does not verify that the valid tool call was accumulated. + assert result is not None