diff --git a/static/js/cookbookProgressSignal.js b/static/js/cookbookProgressSignal.js new file mode 100644 index 0000000..3346b4e --- /dev/null +++ b/static/js/cookbookProgressSignal.js @@ -0,0 +1,29 @@ +// static/js/cookbookProgressSignal.js +/** + * Liveness signal for a running cookbook download/install. The watchdog treats a + * task as stalled when this signal stays unchanged for too long, so it must move + * whenever the task is genuinely making progress. + * + * During a model DOWNLOAD the honest signal is the downloaded-byte counter + * ("1.81G" from "1.81G/2.49G"): it climbs while transferring and freezes when + * stuck — and unlike a % bar or speed/ETA it doesn't keep animating on a frozen + * frame. That path is kept exactly as-is. + * + * But a dependency install (e.g. vllm) spends long stretches with NO byte + * counter — pip dependency resolution and the native CUDA build/compile. A + * byte-only signal freezes there, so the watchdog falsely declares the install + * stale and restarts it mid-build, looping forever (#1568). When there's no byte + * counter, fall back to a fingerprint of the output tail: resolver/compile lines + * keep changing while the process is alive, and only a truly hung process leaves + * the tail frozen. + * + * Pure (string in, string out) so it's unit-testable; cookbookRunning.js pulls + * in browser-only modules and can't load under node. + */ +export function computeProgressSignal(bytes, dlAgg, lastPct, snapshot) { + if (bytes) return bytes; + const base = dlAgg != null ? String(dlAgg) : (lastPct || '0'); + // No byte counter → use the output tail so a build/resolve phase that emits new + // lines counts as progress instead of a false stall (#1568). + return base + '|' + String(snapshot || '').slice(-300); +} diff --git a/static/js/cookbookRunning.js b/static/js/cookbookRunning.js index 3395053..fb1d810 100644 --- a/static/js/cookbookRunning.js +++ b/static/js/cookbookRunning.js @@ -7,6 +7,7 @@ import uiModule from './ui.js'; import { _diagnose, _showDiagnosis, _clearDiagnosis } from './cookbook-diagnosis.js'; import { registerMenuDismiss } from './escMenuStack.js'; +import { computeProgressSignal } from './cookbookProgressSignal.js'; // Human-friendly badge label for a task's internal status. Avoids surfacing // the word "error" in the sidebar — a server the user stopped or one that @@ -2443,7 +2444,11 @@ async function _reconnectTask(el, task) { // back to %/aggregate only when no byte counter is present. const _byteMatches = [...snapshot.matchAll(/([\d.]+\s?[KMGT])B?\s*\/\s*[\d.]+\s?[KMGT]B?/gi)]; const _bytes = _byteMatches.length ? _byteMatches[_byteMatches.length - 1][1].replace(/\s/g, '') : null; - const curProgress = _bytes || (_dlAgg != null ? String(_dlAgg) : (lastPct || '0')); + // When there's no byte counter (pip resolve / native build phase of a + // dependency install), key off the output tail so new build lines count + // as progress — otherwise a long quiet build is falsely declared stale + // and restarted mid-build, looping forever (#1568). + const curProgress = computeProgressSignal(_bytes, _dlAgg, lastPct, snapshot); const _fetchPctMatches = [...snapshot.matchAll(/Fetching\s+\d+\s+files:\s*(\d+)%/g)]; const _fetchPct = _fetchPctMatches.length ? parseInt(_fetchPctMatches[_fetchPctMatches.length - 1][1]) : null; const _startupStalled = !_bytes && ((_dlAgg === 0) || (_fetchPct === 0)) && curProgress === '0'; diff --git a/tests/test_cookbook_progress_signal_js.py b/tests/test_cookbook_progress_signal_js.py new file mode 100644 index 0000000..4067f70 --- /dev/null +++ b/tests/test_cookbook_progress_signal_js.py @@ -0,0 +1,85 @@ +"""Regression for issue #1568 — installing a heavy dependency (vllm) in the +Cookbook crashes in a "stale — restarting" loop. + +The download/install watchdog (static/js/cookbookRunning.js) decides a task is +stalled when its progress signal stays unchanged for STALE_PROGRESS_MS. That +signal used to be the downloaded-byte counter only, which freezes during the long +no-byte-counter phases of a dependency install — pip dependency resolution and +the native CUDA build — so the watchdog falsely declared the install stale and +restarted it mid-build, looping forever. + +computeProgressSignal (cookbookProgressSignal.js) keeps the byte signal for the +download phase (so a genuinely stuck download is still caught) and falls back to +the output tail when there's no byte counter, so build/resolver output counts as +progress. Pure function → executed under node here (cookbookRunning.js pulls in +browser-only modules and can't load). +""" + +import json +import shutil +import subprocess +import textwrap +from pathlib import Path + +import pytest + +_REPO = Path(__file__).resolve().parent.parent +_HAS_NODE = shutil.which("node") is not None + + +@pytest.fixture(scope="module") +def node_available(): + if not _HAS_NODE: + pytest.skip("node binary not on PATH") + + +def _run_node(script: str) -> dict: + res = subprocess.run( + ["node", "--input-type=module", "-e", script], + cwd=_REPO, capture_output=True, timeout=15, text=True, + ) + if res.returncode != 0: + raise AssertionError(f"node failed:\n{res.stderr}") + out = [ln for ln in res.stdout.splitlines() if ln.strip()] + if not out: + raise AssertionError("node produced no stdout") + return json.loads(out[-1]) + + +def test_download_phase_uses_byte_counter_and_ignores_animated_tail(node_available): + """During a download the byte counter is the signal; a stuck download whose + only the ETA/spinner keeps animating must yield the SAME signal (so a real + download stall is still detected).""" + script = textwrap.dedent(""" + const { computeProgressSignal } = await import('./static/js/cookbookProgressSignal.js'); + // Same downloaded bytes, different animated ETA/spinner in the tail. + const a = computeProgressSignal('1.81G', null, '73', 'Downloading 73%| 1.81G/2.49G [eta 0:05:11]'); + const b = computeProgressSignal('1.81G', null, '73', 'Downloading 73%| 1.81G/2.49G [eta 0:09:42] -'); + // Bytes climb -> different. + const c = computeProgressSignal('2.10G', null, '84', 'Downloading 84%| 2.10G/2.49G'); + console.log(JSON.stringify({ a, b, stuck_same: a === b, climbed_diff: a !== c })); + """) + out = _run_node(script) + assert out["a"] == "1.81G" + assert out["stuck_same"] is True, "a stuck download (only ETA animating) must stay the same signal" + assert out["climbed_diff"] is True, "climbing bytes must change the signal" + + +def test_build_phase_progresses_on_new_output(node_available): + """The #1568 case: no byte counter (pip resolve / CUDA build). New build + output must change the signal so it isn't falsely declared stale — whereas a + byte-only signal would read '0' for both and trip the stall timer.""" + script = textwrap.dedent(""" + const { computeProgressSignal } = await import('./static/js/cookbookProgressSignal.js'); + const s1 = computeProgressSignal(null, null, null, 'Building wheel for vllm ... compiling csrc/attention.cu'); + const s2 = computeProgressSignal(null, null, null, 'Building wheel for vllm ... compiling csrc/cache_kernels.cu'); + const hung1 = computeProgressSignal(null, null, null, 'Building wheel for vllm ... (no output)'); + const hung2 = computeProgressSignal(null, null, null, 'Building wheel for vllm ... (no output)'); + console.log(JSON.stringify({ + build_progresses: s1 !== s2, + true_hang_stays: hung1 === hung2, + })); + """) + out = _run_node(script) + assert out["build_progresses"] is True, "new build output must count as progress (#1568)" + assert out["true_hang_stays"] is True, "a genuinely frozen tail must still read as stalled"