Chat metrics: surface backend generation speed

* Chat metrics: show backend's true generation t/s, not tokens÷wall-clock

The per-message tokens/sec read low and felt wrong because it was computed as
output_tokens / total_duration, where total_duration is wall-clock including
prefill, tool calls, and network — not pure decode time. llama.cpp already
reports the correct gen speed in its stream (timings.predicted_per_second), but
it was being dropped.

- llm_core.py: when parsing the OpenAI-compatible usage chunk, also read the
  sibling `timings` block llama.cpp includes — pass predicted_per_second through
  as gen_tps and prompt_per_second as prefill_tps on the usage event.
- agent_loop.py: capture backend_gen_tps/backend_prefill_tps from usage events;
  in _compute_final_metrics prefer backend_gen_tps over the wall-clock division
  when present (fall back to computed for cloud APIs that omit timings). Tag the
  result with tps_source ("backend" vs "computed") and surface prefill_tps.

Result: the displayed t/s now matches the model's real decode speed and is
stable regardless of prompt length (a long prefill no longer deflates it).

Checks: py_compile passes; verified extraction against a real llama.cpp final
chunk (gen 79 t/s surfaced vs the deflated wall-clock figure shown before).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* Chat metrics: surface true t/s on the direct-chat path too

Follow-up to the gen-tps work: the non-agent direct-chat stream path in
chat_routes turned the raw `usage` event straight into a metrics event but only
copied token counts — it never set tokens_per_second or response_time. So simple
(non-tool) replies showed "Speed: n/a" / "Time: undefineds" and the chip fell
back to a bare token count ("27 tok") instead of t/s.

Map the usage event's gen_tps (llama.cpp timings.predicted_per_second, added in
the prior commit) into tokens_per_second here too, tag tps_source=backend, and
set response_time from wall-clock for the stats popup.

Checks: py_compile passes; verified llama.cpp emits usage+timings on the final
stream chunk (gen ~90 t/s) that this path consumes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* Tests: backend gen/prefill t/s passthrough and preference

Cover the two pieces of the true-t/s metric so it can be reviewed on its own:
- stream_llm surfaces llama.cpp's timings.predicted_per_second /
  prompt_per_second as gen_tps / prefill_tps on the usage event (captured
  llama.cpp final-chunk fixture), and omits them when the backend reports no
  timings.
- _compute_final_metrics prefers backend_gen_tps over output/wall-clock,
  tags tps_source ("backend" vs "computed"), and surfaces prefill_tps.

Reuses the fake-client stream harness from test_llm_core_streaming.py.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Leo
2026-06-02 13:52:08 +02:00
committed by GitHub
parent 4cec31d988
commit 6c15dc7d33
4 changed files with 213 additions and 2 deletions

View File

@@ -856,6 +856,15 @@ def setup_chat_routes(
pct = min(round((last_metrics["input_tokens"] / ctx.context_length) * 100, 1), 100.0) pct = min(round((last_metrics["input_tokens"] / ctx.context_length) * 100, 1), 100.0)
last_metrics["context_percent"] = pct last_metrics["context_percent"] = pct
last_metrics["context_length"] = ctx.context_length last_metrics["context_length"] = ctx.context_length
# The frontend reads `tokens_per_second`; the raw usage event
# carries the backend's true gen speed as `gen_tps` (llama.cpp
# timings). Map it through so this direct-chat path shows real
# t/s instead of "n/a" → falling back to a bare token count.
if last_metrics.get("gen_tps") and not last_metrics.get("tokens_per_second"):
last_metrics["tokens_per_second"] = last_metrics["gen_tps"]
last_metrics["tps_source"] = "backend"
# Wall-clock response time for the stats popup ("Time").
last_metrics.setdefault("response_time", round(time.time() - _chat_start, 2))
yield f'data: {json.dumps({"type": "metrics", "data": last_metrics})}\n\n' yield f'data: {json.dumps({"type": "metrics", "data": last_metrics})}\n\n'
except json.JSONDecodeError: except json.JSONDecodeError:
yield chunk yield chunk

View File

@@ -1176,6 +1176,8 @@ def _compute_final_metrics(
model: str = "", model: str = "",
last_round_input_tokens: int = 0, last_round_input_tokens: int = 0,
prep_timings: Optional[Dict[str, float]] = None, prep_timings: Optional[Dict[str, float]] = None,
backend_gen_tps: float = 0,
backend_prefill_tps: float = 0,
) -> dict: ) -> dict:
"""Compute token counts, TPS, and build the final metrics dict.""" """Compute token counts, TPS, and build the final metrics dict."""
if has_real_usage: if has_real_usage:
@@ -1188,7 +1190,15 @@ def _compute_final_metrics(
input_content += msg["content"] + "\n" input_content += msg["content"] + "\n"
input_tokens = len(input_content) // 4 input_tokens = len(input_content) // 4
output_tokens = len(full_response) // 4 output_tokens = len(full_response) // 4
tps = output_tokens / total_duration if total_duration > 0 else 0 # Prefer the backend's true generation speed (llama.cpp
# timings.predicted_per_second) — pure decode, no prefill/tool/network time.
# Fall back to tokens/wall-clock only when the backend didn't report it
# (e.g. cloud APIs without timings); that figure reads low because
# total_duration includes prefill + agent overhead.
if backend_gen_tps and backend_gen_tps > 0:
tps = backend_gen_tps
else:
tps = output_tokens / total_duration if total_duration > 0 else 0
# Use last round's input tokens for context % (peak usage) when available # Use last round's input tokens for context % (peak usage) when available
ctx_tokens = last_round_input_tokens if last_round_input_tokens > 0 else input_tokens ctx_tokens = last_round_input_tokens if last_round_input_tokens > 0 else input_tokens
ctx_pct = min(round((ctx_tokens / context_length) * 100, 1), 100.0) if context_length else 0 ctx_pct = min(round((ctx_tokens / context_length) * 100, 1), 100.0) if context_length else 0
@@ -1199,12 +1209,17 @@ def _compute_final_metrics(
"input_tokens": input_tokens, "input_tokens": input_tokens,
"output_tokens": output_tokens, "output_tokens": output_tokens,
"tokens_per_second": round(tps, 2), "tokens_per_second": round(tps, 2),
# True decode speed when the backend reported it; "computed" = the
# tokens/wall-clock fallback (reads low — includes prefill/overhead).
"tps_source": "backend" if (backend_gen_tps and backend_gen_tps > 0) else "computed",
"total_tokens": input_tokens + output_tokens, "total_tokens": input_tokens + output_tokens,
"context_length": context_length, "context_length": context_length,
"context_percent": ctx_pct, "context_percent": ctx_pct,
"usage_source": "real" if has_real_usage else "estimated", "usage_source": "real" if has_real_usage else "estimated",
"model": model, "model": model,
} }
if backend_prefill_tps and backend_prefill_tps > 0:
metrics["prefill_tps"] = round(backend_prefill_tps, 2)
if prep_timings: if prep_timings:
prep_total = round(sum(prep_timings.values()), 3) prep_total = round(sum(prep_timings.values()), 3)
metrics["agent_prep_time"] = prep_total metrics["agent_prep_time"] = prep_total
@@ -1506,6 +1521,8 @@ async def stream_agent_loop(
real_output_tokens = 0 real_output_tokens = 0
last_round_input_tokens = 0 # Last round's input tokens (for context % peak) last_round_input_tokens = 0 # Last round's input tokens (for context % peak)
has_real_usage = False has_real_usage = False
backend_gen_tps = 0 # backend-reported true gen speed (llama.cpp timings)
backend_prefill_tps = 0 # backend-reported prefill speed
total_tool_calls = 0 # for budget enforcement total_tool_calls = 0 # for budget enforcement
# Loop-breaker state. Small models (e.g. deepseek-v4-flash) can get # Loop-breaker state. Small models (e.g. deepseek-v4-flash) can get
@@ -1655,6 +1672,14 @@ async def stream_agent_loop(
real_output_tokens += u.get("output_tokens", 0) real_output_tokens += u.get("output_tokens", 0)
last_round_input_tokens = round_input last_round_input_tokens = round_input
has_real_usage = True has_real_usage = True
# Backend-reported TRUE generation speed (llama.cpp
# timings.predicted_per_second) — pure decode, excludes
# prefill/network. Preferred over tokens/wall-clock, which
# reads low. Keep the last round's value (the gen phase).
if u.get("gen_tps"):
backend_gen_tps = u["gen_tps"]
if u.get("prefill_tps"):
backend_prefill_tps = u["prefill_tps"]
elif data.get("type") == "fallback": elif data.get("type") == "fallback":
# The selected model failed and another answered; surface # The selected model failed and another answered; surface
# the notice so a misconfigured provider isn't masked. # the notice so a misconfigured provider isn't masked.
@@ -2181,6 +2206,8 @@ async def stream_agent_loop(
has_real_usage, tool_events, round_texts, model=model, has_real_usage, tool_events, round_texts, model=model,
last_round_input_tokens=last_round_input_tokens, last_round_input_tokens=last_round_input_tokens,
prep_timings=prep_timings, prep_timings=prep_timings,
backend_gen_tps=backend_gen_tps,
backend_prefill_tps=backend_prefill_tps,
) )
yield f"data: {json.dumps({'type': 'metrics', 'data': metrics})}\n\n" yield f"data: {json.dumps({'type': 'metrics', 'data': metrics})}\n\n"

View File

@@ -1183,7 +1183,19 @@ async def stream_llm(url: str, model: str, messages: List[Dict], temperature: fl
_delta0 = _choices[0].get("delta") if _choices else None _delta0 = _choices[0].get("delta") if _choices else None
if "usage" in j and _delta0 in (None, {}, {"content": None}): if "usage" in j and _delta0 in (None, {}, {"content": None}):
u = j["usage"] u = j["usage"]
yield f'data: {json.dumps({"type": "usage", "data": {"input_tokens": u.get("prompt_tokens", 0), "output_tokens": u.get("completion_tokens", 0)}})}\n\n' _usage_data = {"input_tokens": u.get("prompt_tokens", 0), "output_tokens": u.get("completion_tokens", 0)}
# llama.cpp puts a `timings` block alongside `usage` with the
# TRUE generation speed (predicted_per_second) — pure decode,
# excluding prefill/network. Pass it through so the UI shows the
# real gen t/s instead of recomputing tokens/wall-clock (which
# includes prefill and reads ~20-40% low). Prefill speed too.
_tm = j.get("timings")
if isinstance(_tm, dict):
if _tm.get("predicted_per_second"):
_usage_data["gen_tps"] = round(_tm["predicted_per_second"], 2)
if _tm.get("prompt_per_second"):
_usage_data["prefill_tps"] = round(_tm["prompt_per_second"], 2)
yield f'data: {json.dumps({"type": "usage", "data": _usage_data})}\n\n'
elif "choices" in j: elif "choices" in j:
delta = j["choices"][0].get("delta") or {} delta = j["choices"][0].get("delta") or {}
if isinstance(delta, dict): if isinstance(delta, dict):

163
tests/test_chat_metrics.py Normal file
View File

@@ -0,0 +1,163 @@
"""Backend-reported generation/prefill speed metrics.
llama.cpp emits a `timings` block alongside `usage` on the final stream chunk
with the TRUE decode speed (predicted_per_second) and prompt speed
(prompt_per_second). These are pure-phase numbers; the old per-message t/s was
output_tokens / wall-clock, which includes prefill + tool + network time and so
reads low (and sags as the prompt grows).
These tests lock in two things:
1. stream_llm passes the llama.cpp `timings` through on the usage event as
gen_tps / prefill_tps (captured-stream fixture), and omits them when the
backend doesn't report timings (e.g. cloud APIs).
2. _compute_final_metrics prefers the backend gen speed over wall-clock when
present, tags tps_source accordingly, and surfaces prefill_tps.
"""
import json
import asyncio
from src import llm_core
from src.agent_loop import _compute_final_metrics
# --- captured-stream harness (mirrors test_llm_core_streaming.py) -----------
class _FakeResp:
def __init__(self, lines):
self._lines = lines
self.status_code = 200
async def aiter_lines(self):
for ln in self._lines:
yield ln
async def aread(self):
return b""
class _FakeStreamCtx:
def __init__(self, lines):
self._lines = lines
async def __aenter__(self):
return _FakeResp(self._lines)
async def __aexit__(self, *a):
return False
class _FakeClient:
def __init__(self, lines):
self._lines = lines
def stream(self, method, url, **kw):
return _FakeStreamCtx(self._lines)
def _usage_event(monkeypatch, lines):
"""Drive stream_llm against canned SSE lines; return the usage event data."""
monkeypatch.setattr(llm_core, "_get_http_client", lambda: _FakeClient(lines))
monkeypatch.setattr(llm_core, "_is_host_dead", lambda u: False)
monkeypatch.setattr(llm_core, "note_model_activity", lambda *a, **k: None)
monkeypatch.setattr(llm_core, "_clear_host_dead", lambda *a, **k: None)
async def run():
usage = None
async for chunk in llm_core.stream_llm(
"http://127.0.0.1:8081/v1/chat/completions",
"qwen-local",
[{"role": "user", "content": "hi"}],
):
for ln in chunk.split("\n"):
ln = ln.strip()
if ln.startswith("data: ") and ln[6:] != "[DONE]":
try:
ev = json.loads(ln[6:])
except ValueError:
continue
if ev.get("type") == "usage":
usage = ev["data"]
return usage
return asyncio.run(run())
# A real llama.cpp final chunk carries `usage` (delta empty / choices []) with a
# sibling `timings` block. The decode speed here (78.91) is far above the
# wall-clock figure the old code would have shown.
_LLAMACPP_TIMINGS_STREAM = [
'data: ' + json.dumps({"choices": [{"index": 0, "delta": {"content": "Hi there"}}]}),
'data: ' + json.dumps({
"choices": [],
"object": "chat.completion.chunk",
"usage": {"prompt_tokens": 15, "completion_tokens": 42},
"timings": {
"prompt_n": 15, "prompt_per_second": 512.34,
"predicted_n": 42, "predicted_per_second": 78.91,
},
}),
"data: [DONE]",
]
def test_stream_llm_passes_through_llamacpp_timings(monkeypatch):
usage = _usage_event(monkeypatch, _LLAMACPP_TIMINGS_STREAM)
assert usage is not None, "no usage event was emitted"
assert usage["input_tokens"] == 15
assert usage["output_tokens"] == 42
# The timings block is surfaced as gen_tps / prefill_tps (rounded to 2dp).
assert usage["gen_tps"] == 78.91
assert usage["prefill_tps"] == 512.34
def test_stream_llm_omits_tps_when_backend_has_no_timings(monkeypatch):
# A backend (e.g. a cloud API) that reports usage but no `timings` block must
# not invent gen_tps/prefill_tps — the caller then falls back to wall-clock.
no_timings = [
'data: ' + json.dumps({"choices": [{"index": 0, "delta": {"content": "Hi"}}]}),
'data: ' + json.dumps({
"choices": [],
"usage": {"prompt_tokens": 8, "completion_tokens": 5},
}),
"data: [DONE]",
]
usage = _usage_event(monkeypatch, no_timings)
assert usage is not None
assert "gen_tps" not in usage
assert "prefill_tps" not in usage
# --- _compute_final_metrics preference logic --------------------------------
def _metrics(**overrides):
kwargs = dict(
messages=[{"role": "user", "content": "hi"}],
full_response="hello world",
total_duration=10.0, # wall-clock: 42/10 = 4.2 t/s (reads low)
time_to_first_token=0.5,
context_length=4096,
real_input_tokens=15,
real_output_tokens=42,
has_real_usage=True,
tool_events=[],
round_texts=[],
model="qwen-local",
)
kwargs.update(overrides)
return _compute_final_metrics(**kwargs)
def test_metrics_prefer_backend_gen_tps_over_wallclock():
m = _metrics(backend_gen_tps=78.91, backend_prefill_tps=512.34)
# Uses the backend's true decode speed, NOT 42/10 = 4.2.
assert m["tokens_per_second"] == 78.91
assert m["tps_source"] == "backend"
assert m["prefill_tps"] == 512.34
def test_metrics_fall_back_to_wallclock_without_backend_timings():
m = _metrics(backend_gen_tps=0, backend_prefill_tps=0)
# 42 output tokens / 10s wall-clock.
assert m["tokens_per_second"] == 4.2
assert m["tps_source"] == "computed"
assert "prefill_tps" not in m