Alert operators when dashboard data is stale #12
@@ -6,6 +6,8 @@ PORT=3117
|
||||
REFRESH_INTERVAL_MINUTES=15
|
||||
AUTO_OPEN_BROWSER=false
|
||||
STALE_DATA_MAX_AGE_MINUTES=60
|
||||
STALE_ALERT_COOLDOWN_MINUTES=60
|
||||
DASHBOARD_URL=
|
||||
TERMINAL_ACTIONS_ENABLED=true
|
||||
SWEEP_TOKEN=
|
||||
BRIEF_VERBOSITY=standard
|
||||
|
||||
@@ -130,6 +130,8 @@ PORT=3117
|
||||
REFRESH_INTERVAL_MINUTES=15
|
||||
AUTO_OPEN_BROWSER=false
|
||||
STALE_DATA_MAX_AGE_MINUTES=60
|
||||
STALE_ALERT_COOLDOWN_MINUTES=60
|
||||
DASHBOARD_URL=https://intelligence.example.internal
|
||||
TERMINAL_ACTIONS_ENABLED=true
|
||||
SWEEP_TOKEN=
|
||||
BRIEF_VERBOSITY=standard
|
||||
@@ -183,6 +185,8 @@ LLM_MODEL=your-model
|
||||
|
||||
For Pangolin or another reverse proxy, forward HTTP traffic to `intelligence-terminal:3117` (or the `PORT` you set). Missing API keys do not crash sweeps; affected sources are reported as degraded in `/api/health`.
|
||||
|
||||
When data remains stale past `STALE_DATA_MAX_AGE_MINUTES`, the server sends an operator alert through configured Telegram/Discord channels after failed or degraded sweep attempts. `STALE_ALERT_COOLDOWN_MINUTES` prevents repeated alerts for the same stale condition from spamming every refresh interval; a change in the last successful sweep, the last error, or the failed/degraded source counts is treated as a new condition and alerts immediately. Set `DASHBOARD_URL` to the Pangolin/public URL you want included in those alerts.
|
||||
|
||||
The dashboard Terminal Actions panel can trigger `status`, `sweep`, and `brief` through `/api/action`. Leave `TERMINAL_ACTIONS_ENABLED=true` for a private home-server deployment. For an internet-exposed deployment, set `SWEEP_TOKEN` and pass it through trusted automation, or set `TERMINAL_ACTIONS_ENABLED=false` to disable browser-triggered actions. If you protect actions with `SWEEP_TOKEN`, the browser can send it from `localStorage.crucix_sweep_token`.
|
||||
|
||||
#### Scenario Watchlist
|
||||
@@ -556,6 +560,9 @@ All settings are in `.env` with sensible defaults:
|
||||
|----------|---------|-------------|
|
||||
| `PORT` | `3117` | Dashboard server port |
|
||||
| `REFRESH_INTERVAL_MINUTES` | `15` | Auto-refresh interval |
|
||||
| `STALE_DATA_MAX_AGE_MINUTES` | `60` | Data age threshold for stale health state |
|
||||
| `STALE_ALERT_COOLDOWN_MINUTES` | `60` | Minimum time between repeated operator stale-data alerts |
|
||||
| `DASHBOARD_URL` | `http://localhost:<PORT>` | Dashboard URL included in operator alerts |
|
||||
| `LLM_PROVIDER` | disabled | `anthropic`, `openai`, `gemini`, `codex`, `openrouter`, `minimax`, `mistral`, or `grok` |
|
||||
| `LLM_API_KEY` | — | API key (not needed for codex) |
|
||||
| `LLM_MODEL` | per-provider default | Override model selection |
|
||||
|
||||
@@ -23,6 +23,8 @@ export default {
|
||||
refreshIntervalMinutes: intEnv('REFRESH_INTERVAL_MINUTES', 15),
|
||||
autoOpenBrowser: boolEnv('AUTO_OPEN_BROWSER', false),
|
||||
staleDataMaxAgeMinutes: intEnv('STALE_DATA_MAX_AGE_MINUTES', 60),
|
||||
staleAlertCooldownMinutes: intEnv('STALE_ALERT_COOLDOWN_MINUTES', 60),
|
||||
dashboardUrl: process.env.DASHBOARD_URL || null,
|
||||
sweepToken: process.env.SWEEP_TOKEN || null,
|
||||
terminalActionsEnabled: boolEnv('TERMINAL_ACTIONS_ENABLED', true),
|
||||
|
||||
|
||||
52
lib/stale-alerts.mjs
Normal file
52
lib/stale-alerts.mjs
Normal file
@@ -0,0 +1,52 @@
|
||||
// Fallback cooldown (one hour) used when the caller supplies no usable `cooldownMs`.
const DEFAULT_COOLDOWN_MS = 60 * 60 * 1000;

/**
 * Decide whether an operator stale-data alert should be sent right now.
 *
 * The decision is deduplicated per "stale condition": a key built from the
 * last successful sweep, the last sweep error and the failed/degraded source
 * counts. The same key is suppressed until `cooldownMs` has elapsed since the
 * previous alert; a different key alerts immediately.
 *
 * NOTE: `state` is mutated. When the result is `send: true` the key and
 * timestamp are recorded before the caller has delivered anything; when the
 * health is not stale the remembered key is cleared.
 *
 * @param {object} health - Health snapshot; only `stale`, `lastSuccessfulSweep`,
 *   `lastSweepError`, `sourcesFailed` and `sourcesDegraded` are read.
 * @param {object} [state] - Caller-owned bookkeeping object
 *   (`lastStaleAlertKey`, `lastStaleAlertAt`). `null` is tolerated and treated
 *   as a throwaway empty state.
 * @param {object} [opts]
 * @param {number} [opts.now] - Current time in ms; defaults to `Date.now()`.
 * @param {number} [opts.cooldownMs] - Minimum ms between alerts for the same
 *   key. Non-finite values fall back to one hour; negatives are treated as 0.
 * @returns {{send: boolean, reason: 'not_stale'|'cooldown'|'stale', key?: string}}
 */
export function shouldSendStaleAlert(health, state = {}, opts = {}) {
  const tracker = state ?? {};
  const options = opts ?? {};
  const now = options.now ?? Date.now();

  // A NaN cooldown (e.g. from an unparsable env value) would make the
  // `< cooldownMs` comparison below always false and silently disable
  // deduplication, so fall back to the default instead.
  const requestedCooldown = Number(options.cooldownMs ?? DEFAULT_COOLDOWN_MS);
  const cooldownMs = Number.isFinite(requestedCooldown)
    ? Math.max(0, requestedCooldown)
    : DEFAULT_COOLDOWN_MS;

  if (!health?.stale) {
    // Forget the active condition so the next stale episode alerts at once.
    tracker.lastStaleAlertKey = null;
    return { send: false, reason: 'not_stale' };
  }

  const key = [
    health.lastSuccessfulSweep || 'never',
    health.lastSweepError || 'no-error',
    health.sourcesFailed || 0,
    health.sourcesDegraded || 0,
  ].join('|');

  const sameCondition = tracker.lastStaleAlertKey === key;
  const elapsedMs = now - (tracker.lastStaleAlertAt || 0);
  if (sameCondition && elapsedMs < cooldownMs) {
    return { send: false, reason: 'cooldown', key };
  }

  tracker.lastStaleAlertKey = key;
  tracker.lastStaleAlertAt = now;
  return { send: true, reason: 'stale', key };
}
|
||||
|
||||
// Cap on per-source lines so the alert stays short enough to read in a chat client.
const MAX_AFFECTED_SOURCES = 6;
// Cap on the length of each per-source error excerpt.
const MAX_SOURCE_ERROR_CHARS = 100;

/**
 * Render the operator stale-data alert as a multi-line text message.
 *
 * Headings are wrapped in `*...*` (presumably rendered as bold by the chat
 * channel's markdown mode — confirm against the alerter implementations).
 *
 * @param {object} health - Health snapshot. Reads `status`, `dataAgeSeconds`,
 *   `lastSuccessfulSweep`, `lastSweep`, `lastSweepError`, `sourcesOk`,
 *   `sourcesDegraded`, `sourcesFailed` and `sourceHealth` (array of
 *   `{ name | n, status, error }`). `null`/`undefined` is treated as empty.
 * @param {object} [opts]
 * @param {string} [opts.dashboardUrl] - Link appended to the alert; defaults
 *   to `http://localhost:3117`.
 * @param {string} [opts.context] - What triggered the check; defaults to
 *   `scheduled sweep`.
 * @returns {string} The formatted alert text.
 */
export function formatStaleAlert(health, opts = {}) {
  const report = health ?? {};
  const options = opts ?? {};
  const dashboardUrl = options.dashboardUrl || 'http://localhost:3117';
  const context = options.context || 'scheduled sweep';

  // Only append the unit when there is a real number; otherwise the line
  // would read "unknown minutes" / "NaN minutes".
  const ageSeconds = report.dataAgeSeconds == null ? Number.NaN : Number(report.dataAgeSeconds);
  const dataAge = Number.isFinite(ageSeconds)
    ? `${Math.floor(ageSeconds / 60)} minutes`
    : 'unknown';

  const sources = Array.isArray(report.sourceHealth) ? report.sourceHealth : [];
  const problems = sources.filter(s => s && ((s.status && s.status !== 'ok') || s.error));
  const affected = problems.slice(0, MAX_AFFECTED_SOURCES).map(s => {
    const label = s.name || s.n || 'source';
    const status = s.status || 'degraded';
    // Collapse whitespace so a multi-line error cannot break the bullet list.
    const detail = s.error
      ? ` (${String(s.error).replace(/\s+/g, ' ').trim().slice(0, MAX_SOURCE_ERROR_CHARS)})`
      : '';
    return `- ${label}: ${status}${detail}`;
  });
  if (problems.length > MAX_AFFECTED_SOURCES) {
    // Make the truncation visible instead of silently dropping sources.
    affected.push(`- ...and ${problems.length - MAX_AFFECTED_SOURCES} more`);
  }

  return [
    '*CRUCIX STALE DATA ALERT*',
    '',
    `Context: ${context}`,
    `Status: ${report.status || 'unknown'}`,
    `Data age: ${dataAge}`,
    `Last successful sweep: ${report.lastSuccessfulSweep || 'never'}`,
    `Last attempted sweep: ${report.lastSweep || 'never'}`,
    `Last error: ${report.lastSweepError || 'none'}`,
    `Sources: ${report.sourcesOk || 0} OK / ${report.sourcesDegraded || 0} degraded / ${report.sourcesFailed || 0} failed`,
    '',
    '*Affected sources*',
    affected.length ? affected.join('\n') : '- No per-source errors available',
    '',
    `Dashboard: ${dashboardUrl}`,
  ].join('\n');
}
|
||||
30
server.mjs
30
server.mjs
@@ -18,6 +18,7 @@ import { TelegramAlerter } from './lib/alerts/telegram.mjs';
|
||||
import { DiscordAlerter } from './lib/alerts/discord.mjs';
|
||||
import { getFetchMetrics } from './apis/utils/fetch.mjs';
|
||||
import { IntelligenceStore } from './lib/intelligence-store.mjs';
|
||||
import { formatStaleAlert, shouldSendStaleAlert } from './lib/stale-alerts.mjs';
|
||||
import { evaluateScenarios } from './lib/scenarios.mjs';
|
||||
|
||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||
@@ -40,6 +41,7 @@ let sweepStartedAt = null; // Timestamp when current/last sweep started
|
||||
let sweepInProgress = false;
|
||||
const startTime = Date.now();
|
||||
const sseClients = new Set();
|
||||
const staleAlertState = {};
|
||||
|
||||
// === Delta/Memory ===
|
||||
const memory = new MemoryManager(RUNS_DIR);
|
||||
@@ -412,6 +414,31 @@ function buildHealth() {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Send an operator alert through every configured channel when the current
 * health snapshot is stale and the stale-alert cooldown allows it.
 *
 * @param {string} [context='scheduled sweep'] - Short description of what
 *   triggered the check; included in the alert text.
 * @returns {Promise<boolean>} `true` when at least one channel accepted the
 *   alert, otherwise `false` (not stale, in cooldown, no channel configured,
 *   or every send failed).
 */
async function notifyIfDataStale(context = 'scheduled sweep') {
  const health = buildHealth();

  // shouldSendStaleAlert records the cooldown as soon as it says "send", i.e.
  // before anything is delivered. Keep the previous bookkeeping so a failed
  // delivery can be undone and retried on the next sweep.
  const previousKey = staleAlertState.lastStaleAlertKey;
  const previousAt = staleAlertState.lastStaleAlertAt;

  const decision = shouldSendStaleAlert(health, staleAlertState, {
    cooldownMs: config.staleAlertCooldownMinutes * 60 * 1000,
  });
  if (!decision.send) return false;

  const dashboardUrl = config.dashboardUrl || `http://localhost:${config.port}`;
  const message = formatStaleAlert(health, { dashboardUrl, context });
  const sends = [];
  if (telegramAlerter.isConfigured) sends.push(telegramAlerter.sendMessage(message));
  if (discordAlerter.isConfigured) sends.push(discordAlerter.sendAlert(message));

  if (sends.length === 0) {
    // The recorded cooldown is intentionally kept here so this warning is
    // rate-limited instead of repeating on every sweep.
    console.warn('[Crucix] Data is stale but no operator alert channel is configured');
    return false;
  }

  const results = await Promise.allSettled(sends);
  for (const result of results) {
    if (result.status === 'rejected') {
      // Surface the failure reason rather than dropping it.
      console.error('[Crucix] Stale-data alert channel error:', result.reason?.message ?? result.reason);
    }
  }

  const sent = results.some(r => r.status === 'fulfilled' && (r.value === true || r.value?.ok === true));
  if (sent) {
    console.warn('[Crucix] Operator stale-data alert sent');
  } else {
    // Nobody was notified: undo the cooldown so the next sweep tries again.
    staleAlertState.lastStaleAlertKey = previousKey;
    staleAlertState.lastStaleAlertAt = previousAt;
    console.warn('[Crucix] Operator stale-data alert attempted but no channel accepted it');
  }
  return sent;
}
|
||||
|
||||
function buildBrief(data) {
|
||||
const verbosity = config.telegram.briefVerbosity || 'standard';
|
||||
const delta = memory.getLastDelta();
|
||||
@@ -562,6 +589,9 @@ async function runSweepCycle() {
|
||||
broadcast({ type: 'sweep_error', error: err.message });
|
||||
} finally {
|
||||
sweepInProgress = false;
|
||||
await notifyIfDataStale(lastSweepError ? 'failed sweep' : 'completed sweep').catch(err => {
|
||||
console.error('[Crucix] Stale-data operator alert failed:', err.message);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@ import test from 'node:test';
|
||||
import assert from 'node:assert/strict';
|
||||
import { readFileSync } from 'node:fs';
|
||||
import { safeFetch, safeFetchText, getFetchMetrics } from '../apis/utils/fetch.mjs';
|
||||
import { formatStaleAlert, shouldSendStaleAlert } from '../lib/stale-alerts.mjs';
|
||||
|
||||
test('safeFetch reports HTML as degraded JSON response', async () => {
|
||||
const originalFetch = globalThis.fetch;
|
||||
@@ -100,6 +101,63 @@ test('safeFetchText returns text and byte count', async () => {
|
||||
}
|
||||
});
|
||||
|
||||
// Non-stale health must never alert and must clear the remembered alert key.
test('stale alert is skipped for fresh health and resets active key', () => {
  const alertState = { lastStaleAlertKey: 'old', lastStaleAlertAt: 100 };
  const freshHealth = { stale: false };

  const { send, reason } = shouldSendStaleAlert(freshHealth, alertState, { now: 200 });

  assert.equal(send, false);
  assert.equal(reason, 'not_stale');
  assert.equal(alertState.lastStaleAlertKey, null);
});
|
||||
|
||||
// Two checks of the same stale condition inside the cooldown window:
// the first alerts, the second is suppressed with reason "cooldown".
test('stale alert sends once and deduplicates during cooldown', () => {
  const alertState = {};
  const staleHealth = {
    stale: true,
    lastSuccessfulSweep: '2026-05-17T08:00:00.000Z',
    lastSweepError: 'network timeout',
    sourcesFailed: 2,
    sourcesDegraded: 1,
  };
  const cooldownMs = 60_000;

  // map() runs the calls in order, so the shared state evolves exactly as
  // it would with two sequential calls.
  const [first, second] = [1_000, 2_000].map(now =>
    shouldSendStaleAlert(staleHealth, alertState, { now, cooldownMs }));

  assert.equal(first.send, true);
  assert.equal(second.send, false);
  assert.equal(second.reason, 'cooldown');
});
|
||||
|
||||
// Once the cooldown has fully elapsed, the same stale condition alerts again.
test('stale alert repeats after cooldown', () => {
  const alertState = {};
  const staleHealth = { stale: true, lastSuccessfulSweep: 'a', lastSweepError: 'timeout', sourcesFailed: 1 };
  const cooldownMs = 60_000;

  // 62_000 - 1_000 = 61_000 ms, which is past the 60_000 ms cooldown.
  for (const now of [1_000, 62_000]) {
    const decision = shouldSendStaleAlert(staleHealth, alertState, { now, cooldownMs });
    assert.equal(decision.send, true);
  }
});
|
||||
|
||||
// The rendered alert must carry the title, the data age in minutes, the
// per-source failure detail and the dashboard link.
test('stale alert message includes operator context and affected sources', () => {
  const staleHealth = {
    status: 'stale',
    stale: true,
    dataAgeSeconds: 7200,
    lastSuccessfulSweep: '2026-05-17T08:00:00.000Z',
    lastSweep: '2026-05-17T10:00:00.000Z',
    lastSweepError: 'GDELT timeout',
    sourcesOk: 20,
    sourcesDegraded: 3,
    sourcesFailed: 2,
    sourceHealth: [
      { name: 'GDELT', status: 'degraded', error: 'timeout' },
      { name: 'Reddit', status: 'no_credentials' },
    ],
  };
  const alertOptions = { dashboardUrl: 'https://terminal.example.test', context: 'failed sweep' };

  const message = formatStaleAlert(staleHealth, alertOptions);

  const expectedPatterns = [
    /CRUCIX STALE DATA ALERT/,
    /Data age: 120 minutes/,
    /GDELT: degraded \(timeout\)/,
    /Dashboard: https:\/\/terminal\.example\.test/,
  ];
  for (const pattern of expectedPatterns) {
    assert.match(message, pattern);
  }
});
|
||||
|
||||
test('scenario watchlist feature is wired into sweep, briefing, and dashboard', () => {
|
||||
const scenarios = readFileSync(new URL('../lib/scenarios.mjs', import.meta.url), 'utf8');
|
||||
const server = readFileSync(new URL('../server.mjs', import.meta.url), 'utf8');
|
||||
|
||||
Reference in New Issue
Block a user