Fix HTML entity decoding and broaden OSINT dedup window

- Replace single &#039; handler with generic numeric/hex entity decoder
  so &#39; and other unpadded entities are properly converted
- Dedup urgent OSINT posts against all hot memory runs (last 3 sweeps)
  instead of only the previous sweep, preventing posts that drop out
  of one sweep from reappearing as "new" in the next

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Greg Scher
2026-03-23 13:01:32 -04:00
parent 31c305cbbb
commit b7322f1c7e
3 changed files with 12 additions and 4 deletions

View File

@@ -169,7 +169,10 @@ function parseWebPreview(html, channelId) {
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&quot;/g, '"')
.replace(/&#039;/g, "'")
.replace(/&#0*39;/g, "'")
.replace(/&#x0*27;/gi, "'")
.replace(/&#(\d+);/g, (_, n) => String.fromCharCode(Number(n)))
.replace(/&#x([0-9a-f]+);/gi, (_, h) => String.fromCharCode(parseInt(h, 16)))
.replace(/&nbsp;/g, ' ')
.trim();
}

View File

@@ -90,7 +90,7 @@ function contentHash(text) {
* @param {object|null} previous - previous sweep's synthesized data (null on first run)
* @param {object} [thresholdOverrides] - optional: { numeric: {...}, count: {...} }
*/
export function computeDelta(current, previous, thresholdOverrides = {}) {
export function computeDelta(current, previous, thresholdOverrides = {}, priorRuns = []) {
if (!previous) return null;
if (!current) return null;
@@ -152,8 +152,11 @@ export function computeDelta(current, previous, thresholdOverrides = {}) {
// ─── New urgent Telegram posts (semantic dedup) ──────────────────────
// Dedup against all recent runs (not just the last one) to catch posts that
// drop out of one sweep but reappear in a later one.
const sources = priorRuns.length > 0 ? priorRuns : [previous];
const prevHashes = new Set(
(previous.tg?.urgent || []).map(p => contentHash(p.text))
sources.flatMap(run => (run?.tg?.urgent || []).map(p => contentHash(p.text)))
);
for (const post of (current.tg?.urgent || [])) {

View File

@@ -74,7 +74,9 @@ export class MemoryManager {
// Add a new run to hot memory
addRun(synthesizedData) {
const previous = this.getLastRun();
const delta = computeDelta(synthesizedData, previous);
// Collect urgent post hashes from all hot runs for broader dedup window
const priorRuns = this.hot.runs.map(r => r.data);
const delta = computeDelta(synthesizedData, previous, {}, priorRuns);
// Compact the data for storage (strip large arrays)
const compact = this._compactForStorage(synthesizedData);