From 6c15dc7d334b7e59e188245bcaffa6a388576b36 Mon Sep 17 00:00:00 2001 From: Leo Date: Tue, 2 Jun 2026 13:52:08 +0200 Subject: [PATCH] Chat metrics: surface backend generation speed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Chat metrics: show backend's true generation t/s, not tokens÷wall-clock The per-message tokens/sec read low and felt wrong because it was computed as output_tokens / total_duration, where total_duration is wall-clock including prefill, tool calls, and network — not pure decode time. llama.cpp already reports the correct gen speed in its stream (timings.predicted_per_second), but it was being dropped. - llm_core.py: when parsing the OpenAI-compatible usage chunk, also read the sibling `timings` block llama.cpp includes — pass predicted_per_second through as gen_tps and prompt_per_second as prefill_tps on the usage event. - agent_loop.py: capture backend_gen_tps/backend_prefill_tps from usage events; in _compute_final_metrics prefer backend_gen_tps over the wall-clock division when present (fall back to computed for cloud APIs that omit timings). Tag the result with tps_source ("backend" vs "computed") and surface prefill_tps. Result: the displayed t/s now matches the model's real decode speed and is stable regardless of prompt length (a long prefill no longer deflates it). Checks: py_compile passes; verified extraction against a real llama.cpp final chunk (gen 79 t/s surfaced vs the deflated wall-clock figure shown before). Co-Authored-By: Claude Opus 4.8 (1M context) * Chat metrics: surface true t/s on the direct-chat path too Follow-up to the gen-tps work: the non-agent direct-chat stream path in chat_routes turned the raw `usage` event straight into a metrics event but only copied token counts — it never set tokens_per_second or response_time. So simple (non-tool) replies showed "Speed: n/a" / "Time: undefineds" and the chip fell back to a bare token count ("27 tok") instead of t/s. Map the usage event's gen_tps (llama.cpp timings.predicted_per_second, added in the prior commit) into tokens_per_second here too, tag tps_source=backend, and set response_time from wall-clock for the stats popup. Checks: py_compile passes; verified llama.cpp emits usage+timings on the final stream chunk (gen ~90 t/s) that this path consumes. Co-Authored-By: Claude Opus 4.8 (1M context) * Tests: backend gen/prefill t/s passthrough and preference Cover the two pieces of the true-t/s metric so it can be reviewed on its own: - stream_llm surfaces llama.cpp's timings.predicted_per_second / prompt_per_second as gen_tps / prefill_tps on the usage event (captured llama.cpp final-chunk fixture), and omits them when the backend reports no timings. - _compute_final_metrics prefers backend_gen_tps over output/wall-clock, tags tps_source ("backend" vs "computed"), and surfaces prefill_tps. Reuses the fake-client stream harness from test_llm_core_streaming.py. Co-Authored-By: Claude Opus 4.8 (1M context) --------- Co-authored-by: Claude Opus 4.8 (1M context) --- routes/chat_routes.py | 9 ++ src/agent_loop.py | 29 ++++++- src/llm_core.py | 14 +++- tests/test_chat_metrics.py | 163 +++++++++++++++++++++++++++++++++++++ 4 files changed, 213 insertions(+), 2 deletions(-) create mode 100644 tests/test_chat_metrics.py diff --git a/routes/chat_routes.py b/routes/chat_routes.py index df3ae7a..3f7738a 100644 --- a/routes/chat_routes.py +++ b/routes/chat_routes.py @@ -856,6 +856,15 @@ def setup_chat_routes( pct = min(round((last_metrics["input_tokens"] / ctx.context_length) * 100, 1), 100.0) last_metrics["context_percent"] = pct last_metrics["context_length"] = ctx.context_length + # The frontend reads `tokens_per_second`; the raw usage event + # carries the backend's true gen speed as `gen_tps` (llama.cpp + # timings). Map it through so this direct-chat path shows real + # t/s instead of "n/a" → falling back to a bare token count. + if last_metrics.get("gen_tps") and not last_metrics.get("tokens_per_second"): + last_metrics["tokens_per_second"] = last_metrics["gen_tps"] + last_metrics["tps_source"] = "backend" + # Wall-clock response time for the stats popup ("Time"). + last_metrics.setdefault("response_time", round(time.time() - _chat_start, 2)) yield f'data: {json.dumps({"type": "metrics", "data": last_metrics})}\n\n' except json.JSONDecodeError: yield chunk diff --git a/src/agent_loop.py b/src/agent_loop.py index 94bed46..0e3dce9 100644 --- a/src/agent_loop.py +++ b/src/agent_loop.py @@ -1176,6 +1176,8 @@ def _compute_final_metrics( model: str = "", last_round_input_tokens: int = 0, prep_timings: Optional[Dict[str, float]] = None, + backend_gen_tps: float = 0, + backend_prefill_tps: float = 0, ) -> dict: """Compute token counts, TPS, and build the final metrics dict.""" if has_real_usage: @@ -1188,7 +1190,15 @@ def _compute_final_metrics( input_content += msg["content"] + "\n" input_tokens = len(input_content) // 4 output_tokens = len(full_response) // 4 - tps = output_tokens / total_duration if total_duration > 0 else 0 + # Prefer the backend's true generation speed (llama.cpp + # timings.predicted_per_second) — pure decode, no prefill/tool/network time. + # Fall back to tokens/wall-clock only when the backend didn't report it + # (e.g. cloud APIs without timings); that figure reads low because + # total_duration includes prefill + agent overhead. + if backend_gen_tps and backend_gen_tps > 0: + tps = backend_gen_tps + else: + tps = output_tokens / total_duration if total_duration > 0 else 0 # Use last round's input tokens for context % (peak usage) when available ctx_tokens = last_round_input_tokens if last_round_input_tokens > 0 else input_tokens ctx_pct = min(round((ctx_tokens / context_length) * 100, 1), 100.0) if context_length else 0 @@ -1199,12 +1209,17 @@ def _compute_final_metrics( "input_tokens": input_tokens, "output_tokens": output_tokens, "tokens_per_second": round(tps, 2), + # True decode speed when the backend reported it; "computed" = the + # tokens/wall-clock fallback (reads low — includes prefill/overhead). + "tps_source": "backend" if (backend_gen_tps and backend_gen_tps > 0) else "computed", "total_tokens": input_tokens + output_tokens, "context_length": context_length, "context_percent": ctx_pct, "usage_source": "real" if has_real_usage else "estimated", "model": model, } + if backend_prefill_tps and backend_prefill_tps > 0: + metrics["prefill_tps"] = round(backend_prefill_tps, 2) if prep_timings: prep_total = round(sum(prep_timings.values()), 3) metrics["agent_prep_time"] = prep_total @@ -1506,6 +1521,8 @@ async def stream_agent_loop( real_output_tokens = 0 last_round_input_tokens = 0 # Last round's input tokens (for context % peak) has_real_usage = False + backend_gen_tps = 0 # backend-reported true gen speed (llama.cpp timings) + backend_prefill_tps = 0 # backend-reported prefill speed total_tool_calls = 0 # for budget enforcement # Loop-breaker state. Small models (e.g. deepseek-v4-flash) can get @@ -1655,6 +1672,14 @@ async def stream_agent_loop( real_output_tokens += u.get("output_tokens", 0) last_round_input_tokens = round_input has_real_usage = True + # Backend-reported TRUE generation speed (llama.cpp + # timings.predicted_per_second) — pure decode, excludes + # prefill/network. Preferred over tokens/wall-clock, which + # reads low. Keep the last round's value (the gen phase). + if u.get("gen_tps"): + backend_gen_tps = u["gen_tps"] + if u.get("prefill_tps"): + backend_prefill_tps = u["prefill_tps"] elif data.get("type") == "fallback": # The selected model failed and another answered; surface # the notice so a misconfigured provider isn't masked. @@ -2181,6 +2206,8 @@ async def stream_agent_loop( has_real_usage, tool_events, round_texts, model=model, last_round_input_tokens=last_round_input_tokens, prep_timings=prep_timings, + backend_gen_tps=backend_gen_tps, + backend_prefill_tps=backend_prefill_tps, ) yield f"data: {json.dumps({'type': 'metrics', 'data': metrics})}\n\n" diff --git a/src/llm_core.py b/src/llm_core.py index 12ef35b..a407f97 100644 --- a/src/llm_core.py +++ b/src/llm_core.py @@ -1183,7 +1183,19 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl _delta0 = _choices[0].get("delta") if _choices else None if "usage" in j and _delta0 in (None, {}, {"content": None}): u = j["usage"] - yield f'data: {json.dumps({"type": "usage", "data": {"input_tokens": u.get("prompt_tokens", 0), "output_tokens": u.get("completion_tokens", 0)}})}\n\n' + _usage_data = {"input_tokens": u.get("prompt_tokens", 0), "output_tokens": u.get("completion_tokens", 0)} + # llama.cpp puts a `timings` block alongside `usage` with the + # TRUE generation speed (predicted_per_second) — pure decode, + # excluding prefill/network. Pass it through so the UI shows the + # real gen t/s instead of recomputing tokens/wall-clock (which + # includes prefill and reads ~20-40% low). Prefill speed too. + _tm = j.get("timings") + if isinstance(_tm, dict): + if _tm.get("predicted_per_second"): + _usage_data["gen_tps"] = round(_tm["predicted_per_second"], 2) + if _tm.get("prompt_per_second"): + _usage_data["prefill_tps"] = round(_tm["prompt_per_second"], 2) + yield f'data: {json.dumps({"type": "usage", "data": _usage_data})}\n\n' elif "choices" in j: delta = j["choices"][0].get("delta") or {} if isinstance(delta, dict): diff --git a/tests/test_chat_metrics.py b/tests/test_chat_metrics.py new file mode 100644 index 0000000..9a218fa --- /dev/null +++ b/tests/test_chat_metrics.py @@ -0,0 +1,163 @@ +"""Backend-reported generation/prefill speed metrics. + +llama.cpp emits a `timings` block alongside `usage` on the final stream chunk +with the TRUE decode speed (predicted_per_second) and prompt speed +(prompt_per_second). These are pure-phase numbers; the old per-message t/s was +output_tokens / wall-clock, which includes prefill + tool + network time and so +reads low (and sags as the prompt grows). + +These tests lock in two things: + 1. stream_llm passes the llama.cpp `timings` through on the usage event as + gen_tps / prefill_tps (captured-stream fixture), and omits them when the + backend doesn't report timings (e.g. cloud APIs). + 2. _compute_final_metrics prefers the backend gen speed over wall-clock when + present, tags tps_source accordingly, and surfaces prefill_tps. +""" +import json +import asyncio + +from src import llm_core +from src.agent_loop import _compute_final_metrics + + +# --- captured-stream harness (mirrors test_llm_core_streaming.py) ----------- + +class _FakeResp: + def __init__(self, lines): + self._lines = lines + self.status_code = 200 + + async def aiter_lines(self): + for ln in self._lines: + yield ln + + async def aread(self): + return b"" + + +class _FakeStreamCtx: + def __init__(self, lines): + self._lines = lines + + async def __aenter__(self): + return _FakeResp(self._lines) + + async def __aexit__(self, *a): + return False + + +class _FakeClient: + def __init__(self, lines): + self._lines = lines + + def stream(self, method, url, **kw): + return _FakeStreamCtx(self._lines) + + +def _usage_event(monkeypatch, lines): + """Drive stream_llm against canned SSE lines; return the usage event data.""" + monkeypatch.setattr(llm_core, "_get_http_client", lambda: _FakeClient(lines)) + monkeypatch.setattr(llm_core, "_is_host_dead", lambda u: False) + monkeypatch.setattr(llm_core, "note_model_activity", lambda *a, **k: None) + monkeypatch.setattr(llm_core, "_clear_host_dead", lambda *a, **k: None) + + async def run(): + usage = None + async for chunk in llm_core.stream_llm( + "http://127.0.0.1:8081/v1/chat/completions", + "qwen-local", + [{"role": "user", "content": "hi"}], + ): + for ln in chunk.split("\n"): + ln = ln.strip() + if ln.startswith("data: ") and ln[6:] != "[DONE]": + try: + ev = json.loads(ln[6:]) + except ValueError: + continue + if ev.get("type") == "usage": + usage = ev["data"] + return usage + + return asyncio.run(run()) + + +# A real llama.cpp final chunk carries `usage` (delta empty / choices []) with a +# sibling `timings` block. The decode speed here (78.91) is far above the +# wall-clock figure the old code would have shown. +_LLAMACPP_TIMINGS_STREAM = [ + 'data: ' + json.dumps({"choices": [{"index": 0, "delta": {"content": "Hi there"}}]}), + 'data: ' + json.dumps({ + "choices": [], + "object": "chat.completion.chunk", + "usage": {"prompt_tokens": 15, "completion_tokens": 42}, + "timings": { + "prompt_n": 15, "prompt_per_second": 512.34, + "predicted_n": 42, "predicted_per_second": 78.91, + }, + }), + "data: [DONE]", +] + + +def test_stream_llm_passes_through_llamacpp_timings(monkeypatch): + usage = _usage_event(monkeypatch, _LLAMACPP_TIMINGS_STREAM) + assert usage is not None, "no usage event was emitted" + assert usage["input_tokens"] == 15 + assert usage["output_tokens"] == 42 + # The timings block is surfaced as gen_tps / prefill_tps (rounded to 2dp). + assert usage["gen_tps"] == 78.91 + assert usage["prefill_tps"] == 512.34 + + +def test_stream_llm_omits_tps_when_backend_has_no_timings(monkeypatch): + # A backend (e.g. a cloud API) that reports usage but no `timings` block must + # not invent gen_tps/prefill_tps — the caller then falls back to wall-clock. + no_timings = [ + 'data: ' + json.dumps({"choices": [{"index": 0, "delta": {"content": "Hi"}}]}), + 'data: ' + json.dumps({ + "choices": [], + "usage": {"prompt_tokens": 8, "completion_tokens": 5}, + }), + "data: [DONE]", + ] + usage = _usage_event(monkeypatch, no_timings) + assert usage is not None + assert "gen_tps" not in usage + assert "prefill_tps" not in usage + + +# --- _compute_final_metrics preference logic -------------------------------- + +def _metrics(**overrides): + kwargs = dict( + messages=[{"role": "user", "content": "hi"}], + full_response="hello world", + total_duration=10.0, # wall-clock: 42/10 = 4.2 t/s (reads low) + time_to_first_token=0.5, + context_length=4096, + real_input_tokens=15, + real_output_tokens=42, + has_real_usage=True, + tool_events=[], + round_texts=[], + model="qwen-local", + ) + kwargs.update(overrides) + return _compute_final_metrics(**kwargs) + + +def test_metrics_prefer_backend_gen_tps_over_wallclock(): + m = _metrics(backend_gen_tps=78.91, backend_prefill_tps=512.34) + # Uses the backend's true decode speed, NOT 42/10 = 4.2. + assert m["tokens_per_second"] == 78.91 + assert m["tps_source"] == "backend" + assert m["prefill_tps"] == 512.34 + + +def test_metrics_fall_back_to_wallclock_without_backend_timings(): + m = _metrics(backend_gen_tps=0, backend_prefill_tps=0) + # 42 output tokens / 10s wall-clock. + assert m["tokens_per_second"] == 4.2 + assert m["tps_source"] == "computed" + assert "prefill_tps" not in m