Fix Telegram dedup identity and legacy Markdown escaping

This commit is contained in:
calesthio
2026-03-24 18:48:55 -07:00
parent b7322f1c7e
commit 5c08355e38
3 changed files with 40 additions and 8 deletions

View File

@@ -66,9 +66,9 @@ const RISK_KEYS = ['vix', 'hy_spread', 'urgent_posts', 'conflict_events', 'therm
// ─── Semantic Hashing for Telegram Posts ─────────────────────────────────────
/**
* Produce a normalized semantic hash of a post's content.
* Strips timestamps, normalizes numbers, lowercases — so "BREAKING: 5 missiles at 14:32"
* and "Breaking: 7 missiles at 15:01" produce the same hash (both are "missile strike" signals).
* Because of that, the hash is intentionally lossy and is only safe as a fallback when a
* stable post identity is unavailable.
*/
function contentHash(text) {
if (!text) return '';
@@ -83,12 +83,32 @@ function contentHash(text) {
return createHash('sha256').update(normalized).digest('hex').substring(0, 12);
}
/**
 * Build a dedup key for a Telegram post, preferring stable identity over content.
 *
 * Resolution order:
 *   1. `id:<postId|messageId>` when the post carries an identifier.
 *   2. The first 16 hex chars of SHA-256 over `channel|date|text`, where the
 *      channel is `post.channel` or `post.chat` and the text is trimmed and
 *      capped at 200 characters, when both a channel and a date are present.
 *   3. `semantic:<contentHash(text)>` as a last resort.
 *
 * @param {object|null|undefined} post - post record; fields read are
 *   `postId`, `messageId`, `channel`, `chat`, `date` and `text`.
 * @returns {string} the dedup key, or `''` when the post has nothing to key on,
 *   so callers can drop it with a plain truthiness check.
 */
function stablePostKey(post) {
  if (!post) return '';

  // NOTE(review): assumes postId/messageId are unique across channels — confirm
  // against the Telegram source that populates them.
  const sourceId = post.postId || post.messageId || '';
  if (sourceId) return `id:${sourceId}`;

  const channelId = post.channel || post.chat || '';
  const date = post.date || '';
  if (channelId && date) {
    // Only the leading 200 characters of the trimmed text feed the hash.
    const text = (post.text || '').trim().substring(0, 200);
    return createHash('sha256')
      .update(`${channelId}|${date}|${text}`)
      .digest('hex')
      .substring(0, 16);
  }

  // Without identity AND without text there is nothing to key on. Returning
  // the bare prefix 'semantic:' would be truthy and shared by every such post,
  // so it would slip past the callers' empty-key checks.
  if (!post.text) return '';
  const semantic = contentHash(post.text);
  return semantic ? `semantic:${semantic}` : '';
}
// ─── Core Delta Computation ──────────────────────────────────────────────────
/**
* @param {object} current - current sweep's synthesized data
* @param {object|null} previous - previous sweep's synthesized data (null on first run)
* @param {object} [thresholdOverrides] - optional: { numeric: {...}, count: {...} }
* @param {Array<object>} [priorRuns] - optional compacted prior runs for broader dedup
*/
export function computeDelta(current, previous, thresholdOverrides = {}, priorRuns = []) {
if (!previous) return null;
@@ -153,14 +173,16 @@ export function computeDelta(current, previous, thresholdOverrides = {}, priorRu
// ─── New urgent Telegram posts (semantic dedup) ──────────────────────
// Dedup against all recent runs (not just the last one) to catch posts that
// drop out of one sweep but reappear in a later one. Use stable post identity
// where possible so updated posts are not collapsed into earlier alerts just
// because their text is semantically similar.
const sources = priorRuns.length > 0 ? priorRuns : [previous];
const prevHashes = new Set(
sources.flatMap(run => (run?.tg?.urgent || []).map(p => contentHash(p.text)))
sources.flatMap(run => (run?.tg?.urgent || []).map(stablePostKey)).filter(Boolean)
);
for (const post of (current.tg?.urgent || [])) {
const hash = contentHash(post.text);
const hash = stablePostKey(post);
if (hash && !prevHashes.has(hash)) {
signals.new.push({
key: `tg_urgent:${hash}`,

View File

@@ -201,7 +201,15 @@ export class MemoryManager {
bls: data.bls,
treasury: data.treasury,
gscpi: data.gscpi,
tg: { posts: data.tg?.posts, urgent: (data.tg?.urgent || []).map(p => ({ text: p.text, date: p.date })) },
tg: {
posts: data.tg?.posts,
urgent: (data.tg?.urgent || []).map(p => ({
text: p.text,
date: p.date,
channel: p.channel || p.chat || null,
postId: p.postId || null,
})),
},
thermal: (data.thermal || []).map(t => ({ region: t.region, det: t.det, night: t.night, hc: t.hc })),
air: (data.air || []).map(a => ({ region: a.region, total: a.total })),
nuke: (data.nuke || []).map(n => ({ site: n.site, anom: n.anom, cpm: n.cpm })),