From b7322f1c7e7ee34984e65609ee174b4f2fdc7613 Mon Sep 17 00:00:00 2001 From: Greg Scher Date: Mon, 23 Mar 2026 13:01:32 -0400 Subject: [PATCH] Fix HTML entity decoding and broaden OSINT dedup window - Replace single &#039; handler with generic numeric/hex entity decoder so &#39; and other unpadded entities are properly converted - Dedup urgent OSINT posts against all hot memory runs (last 3 sweeps) instead of only the previous sweep, preventing posts that drop out of one sweep from reappearing as "new" in the next Co-Authored-By: Claude Opus 4.6 (1M context) --- apis/sources/telegram.mjs | 5 ++++- lib/delta/engine.mjs | 7 +++++-- lib/delta/memory.mjs | 4 +++- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/apis/sources/telegram.mjs b/apis/sources/telegram.mjs index c4f6e17..5157d3d 100644 --- a/apis/sources/telegram.mjs +++ b/apis/sources/telegram.mjs @@ -169,7 +169,10 @@ function parseWebPreview(html, channelId) { .replace(/&lt;/g, '<') .replace(/&gt;/g, '>') .replace(/&quot;/g, '"') - .replace(/&#039;/g, "'") + .replace(/&#0*39;/g, "'") + .replace(/&#x0*27;/gi, "'") + .replace(/&#(\d+);/g, (_, n) => String.fromCharCode(Number(n))) + .replace(/&#x([0-9a-f]+);/gi, (_, h) => String.fromCharCode(parseInt(h, 16))) .replace(/&nbsp;/g, ' ') .trim(); } diff --git a/lib/delta/engine.mjs b/lib/delta/engine.mjs index e285473..c98d50f 100644 --- a/lib/delta/engine.mjs +++ b/lib/delta/engine.mjs @@ -90,7 +90,7 @@ function contentHash(text) { * @param {object|null} previous - previous sweep's synthesized data (null on first run) * @param {object} [thresholdOverrides] - optional: { numeric: {...}, count: {...} } */ -export function computeDelta(current, previous, thresholdOverrides = {}) { +export function computeDelta(current, previous, thresholdOverrides = {}, priorRuns = []) { if (!previous) return null; if (!current) return null; @@ -152,8 +152,11 @@ export function computeDelta(current, previous, thresholdOverrides = {}) { // ─── New urgent Telegram posts (semantic dedup) 
────────────────────── + // Dedup against all recent runs (not just the last one) to catch posts that + // drop out of one sweep but reappear in a later one. + const sources = priorRuns.length > 0 ? priorRuns : [previous]; const prevHashes = new Set( - (previous.tg?.urgent || []).map(p => contentHash(p.text)) + sources.flatMap(run => (run?.tg?.urgent || []).map(p => contentHash(p.text))) ); for (const post of (current.tg?.urgent || [])) { diff --git a/lib/delta/memory.mjs b/lib/delta/memory.mjs index 9d3c3b3..a551ebb 100644 --- a/lib/delta/memory.mjs +++ b/lib/delta/memory.mjs @@ -74,7 +74,9 @@ export class MemoryManager { // Add a new run to hot memory addRun(synthesizedData) { const previous = this.getLastRun(); - const delta = computeDelta(synthesizedData, previous); + // Collect urgent post hashes from all hot runs for broader dedup window + const priorRuns = this.hot.runs.map(r => r.data); + const delta = computeDelta(synthesizedData, previous, {}, priorRuns); // Compact the data for storage (strip large arrays) const compact = this._compactForStorage(synthesizedData);