Installing a heavy dependency like vllm crashes in a "stale — restarting" loop:
it restarts mid-install, reuses the cached wheels, then stalls again.
The download/install watchdog (cookbookRunning.js) keyed its stall signal purely
off the downloaded-byte counter ("1.81G/2.49G"). A dependency install spends long
stretches with NO byte counter — pip dependency resolution and the native CUDA
build/compile — so the signal froze and after STALE_PROGRESS_MS the watchdog
declared it stale and auto-restarted it mid-build, looping forever.
Extract the signal into a pure computeProgressSignal (cookbookProgressSignal.js):
keep the byte counter for the download phase (so a genuinely stuck download is
still caught, and an animating-but-frozen ETA frame is NOT mistaken for progress),
and when there's no byte counter fall back to a fingerprint of the output tail so
resolver/compile lines count as progress. Only a truly frozen tail now reads as
stalled.
Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
29
static/js/cookbookProgressSignal.js
Normal file
29
static/js/cookbookProgressSignal.js
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
// static/js/cookbookProgressSignal.js
|
||||||
|
/**
|
||||||
|
* Liveness signal for a running cookbook download/install. The watchdog treats a
|
||||||
|
* task as stalled when this signal stays unchanged for too long, so it must move
|
||||||
|
* whenever the task is genuinely making progress.
|
||||||
|
*
|
||||||
|
* During a model DOWNLOAD the honest signal is the downloaded-byte counter
|
||||||
|
* ("1.81G" from "1.81G/2.49G"): it climbs while transferring and freezes when
|
||||||
|
* stuck — and unlike a % bar or speed/ETA it doesn't keep animating on a frozen
|
||||||
|
* frame. That path is kept exactly as-is.
|
||||||
|
*
|
||||||
|
* But a dependency install (e.g. vllm) spends long stretches with NO byte
|
||||||
|
* counter — pip dependency resolution and the native CUDA build/compile. A
|
||||||
|
* byte-only signal freezes there, so the watchdog falsely declares the install
|
||||||
|
* stale and restarts it mid-build, looping forever (#1568). When there's no byte
|
||||||
|
* counter, fall back to a fingerprint of the output tail: resolver/compile lines
|
||||||
|
* keep changing while the process is alive, and only a truly hung process leaves
|
||||||
|
* the tail frozen.
|
||||||
|
*
|
||||||
|
* Pure (string in, string out) so it's unit-testable; cookbookRunning.js pulls
|
||||||
|
* in browser-only modules and can't load under node.
|
||||||
|
*/
|
||||||
|
export function computeProgressSignal(bytes, dlAgg, lastPct, snapshot) {
|
||||||
|
if (bytes) return bytes;
|
||||||
|
const base = dlAgg != null ? String(dlAgg) : (lastPct || '0');
|
||||||
|
// No byte counter → use the output tail so a build/resolve phase that emits new
|
||||||
|
// lines counts as progress instead of a false stall (#1568).
|
||||||
|
return base + '|' + String(snapshot || '').slice(-300);
|
||||||
|
}
|
||||||
@@ -7,6 +7,7 @@
|
|||||||
import uiModule from './ui.js';
|
import uiModule from './ui.js';
|
||||||
import { _diagnose, _showDiagnosis, _clearDiagnosis } from './cookbook-diagnosis.js';
|
import { _diagnose, _showDiagnosis, _clearDiagnosis } from './cookbook-diagnosis.js';
|
||||||
import { registerMenuDismiss } from './escMenuStack.js';
|
import { registerMenuDismiss } from './escMenuStack.js';
|
||||||
|
import { computeProgressSignal } from './cookbookProgressSignal.js';
|
||||||
|
|
||||||
// Human-friendly badge label for a task's internal status. Avoids surfacing
|
// Human-friendly badge label for a task's internal status. Avoids surfacing
|
||||||
// the word "error" in the sidebar — a server the user stopped or one that
|
// the word "error" in the sidebar — a server the user stopped or one that
|
||||||
@@ -2443,7 +2444,11 @@ async function _reconnectTask(el, task) {
|
|||||||
// back to %/aggregate only when no byte counter is present.
|
// back to %/aggregate only when no byte counter is present.
|
||||||
const _byteMatches = [...snapshot.matchAll(/([\d.]+\s?[KMGT])B?\s*\/\s*[\d.]+\s?[KMGT]B?/gi)];
|
const _byteMatches = [...snapshot.matchAll(/([\d.]+\s?[KMGT])B?\s*\/\s*[\d.]+\s?[KMGT]B?/gi)];
|
||||||
const _bytes = _byteMatches.length ? _byteMatches[_byteMatches.length - 1][1].replace(/\s/g, '') : null;
|
const _bytes = _byteMatches.length ? _byteMatches[_byteMatches.length - 1][1].replace(/\s/g, '') : null;
|
||||||
const curProgress = _bytes || (_dlAgg != null ? String(_dlAgg) : (lastPct || '0'));
|
// When there's no byte counter (pip resolve / native build phase of a
|
||||||
|
// dependency install), key off the output tail so new build lines count
|
||||||
|
// as progress — otherwise a long quiet build is falsely declared stale
|
||||||
|
// and restarted mid-build, looping forever (#1568).
|
||||||
|
const curProgress = computeProgressSignal(_bytes, _dlAgg, lastPct, snapshot);
|
||||||
const _fetchPctMatches = [...snapshot.matchAll(/Fetching\s+\d+\s+files:\s*(\d+)%/g)];
|
const _fetchPctMatches = [...snapshot.matchAll(/Fetching\s+\d+\s+files:\s*(\d+)%/g)];
|
||||||
const _fetchPct = _fetchPctMatches.length ? parseInt(_fetchPctMatches[_fetchPctMatches.length - 1][1]) : null;
|
const _fetchPct = _fetchPctMatches.length ? parseInt(_fetchPctMatches[_fetchPctMatches.length - 1][1]) : null;
|
||||||
const _startupStalled = !_bytes && ((_dlAgg === 0) || (_fetchPct === 0)) && curProgress === '0';
|
const _startupStalled = !_bytes && ((_dlAgg === 0) || (_fetchPct === 0)) && curProgress === '0';
|
||||||
|
|||||||
85
tests/test_cookbook_progress_signal_js.py
Normal file
85
tests/test_cookbook_progress_signal_js.py
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
"""Regression for issue #1568 — installing a heavy dependency (vllm) in the
|
||||||
|
Cookbook crashes in a "stale — restarting" loop.
|
||||||
|
|
||||||
|
The download/install watchdog (static/js/cookbookRunning.js) decides a task is
|
||||||
|
stalled when its progress signal stays unchanged for STALE_PROGRESS_MS. That
|
||||||
|
signal used to be the downloaded-byte counter only, which freezes during the long
|
||||||
|
no-byte-counter phases of a dependency install — pip dependency resolution and
|
||||||
|
the native CUDA build — so the watchdog falsely declared the install stale and
|
||||||
|
restarted it mid-build, looping forever.
|
||||||
|
|
||||||
|
computeProgressSignal (cookbookProgressSignal.js) keeps the byte signal for the
|
||||||
|
download phase (so a genuinely stuck download is still caught) and falls back to
|
||||||
|
the output tail when there's no byte counter, so build/resolver output counts as
|
||||||
|
progress. Pure function → executed under node here (cookbookRunning.js pulls in
|
||||||
|
browser-only modules and can't load).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import textwrap
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
_REPO = Path(__file__).resolve().parent.parent
|
||||||
|
_HAS_NODE = shutil.which("node") is not None
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
|
||||||
|
def node_available():
|
||||||
|
if not _HAS_NODE:
|
||||||
|
pytest.skip("node binary not on PATH")
|
||||||
|
|
||||||
|
|
||||||
|
def _run_node(script: str) -> dict:
|
||||||
|
res = subprocess.run(
|
||||||
|
["node", "--input-type=module", "-e", script],
|
||||||
|
cwd=_REPO, capture_output=True, timeout=15, text=True,
|
||||||
|
)
|
||||||
|
if res.returncode != 0:
|
||||||
|
raise AssertionError(f"node failed:\n{res.stderr}")
|
||||||
|
out = [ln for ln in res.stdout.splitlines() if ln.strip()]
|
||||||
|
if not out:
|
||||||
|
raise AssertionError("node produced no stdout")
|
||||||
|
return json.loads(out[-1])
|
||||||
|
|
||||||
|
|
||||||
|
def test_download_phase_uses_byte_counter_and_ignores_animated_tail(node_available):
|
||||||
|
"""During a download the byte counter is the signal; a stuck download whose
|
||||||
|
only the ETA/spinner keeps animating must yield the SAME signal (so a real
|
||||||
|
download stall is still detected)."""
|
||||||
|
script = textwrap.dedent("""
|
||||||
|
const { computeProgressSignal } = await import('./static/js/cookbookProgressSignal.js');
|
||||||
|
// Same downloaded bytes, different animated ETA/spinner in the tail.
|
||||||
|
const a = computeProgressSignal('1.81G', null, '73', 'Downloading 73%| 1.81G/2.49G [eta 0:05:11]');
|
||||||
|
const b = computeProgressSignal('1.81G', null, '73', 'Downloading 73%| 1.81G/2.49G [eta 0:09:42] -');
|
||||||
|
// Bytes climb -> different.
|
||||||
|
const c = computeProgressSignal('2.10G', null, '84', 'Downloading 84%| 2.10G/2.49G');
|
||||||
|
console.log(JSON.stringify({ a, b, stuck_same: a === b, climbed_diff: a !== c }));
|
||||||
|
""")
|
||||||
|
out = _run_node(script)
|
||||||
|
assert out["a"] == "1.81G"
|
||||||
|
assert out["stuck_same"] is True, "a stuck download (only ETA animating) must stay the same signal"
|
||||||
|
assert out["climbed_diff"] is True, "climbing bytes must change the signal"
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_phase_progresses_on_new_output(node_available):
|
||||||
|
"""The #1568 case: no byte counter (pip resolve / CUDA build). New build
|
||||||
|
output must change the signal so it isn't falsely declared stale — whereas a
|
||||||
|
byte-only signal would read '0' for both and trip the stall timer."""
|
||||||
|
script = textwrap.dedent("""
|
||||||
|
const { computeProgressSignal } = await import('./static/js/cookbookProgressSignal.js');
|
||||||
|
const s1 = computeProgressSignal(null, null, null, 'Building wheel for vllm ... compiling csrc/attention.cu');
|
||||||
|
const s2 = computeProgressSignal(null, null, null, 'Building wheel for vllm ... compiling csrc/cache_kernels.cu');
|
||||||
|
const hung1 = computeProgressSignal(null, null, null, 'Building wheel for vllm ... (no output)');
|
||||||
|
const hung2 = computeProgressSignal(null, null, null, 'Building wheel for vllm ... (no output)');
|
||||||
|
console.log(JSON.stringify({
|
||||||
|
build_progresses: s1 !== s2,
|
||||||
|
true_hang_stays: hung1 === hung2,
|
||||||
|
}));
|
||||||
|
""")
|
||||||
|
out = _run_node(script)
|
||||||
|
assert out["build_progresses"] is True, "new build output must count as progress (#1568)"
|
||||||
|
assert out["true_hang_stays"] is True, "a genuinely frozen tail must still read as stalled"
|
||||||
Reference in New Issue
Block a user