Fix Telegram dedup identity and legacy Markdown escaping
This commit is contained in:
@@ -66,9 +66,9 @@ const RISK_KEYS = ['vix', 'hy_spread', 'urgent_posts', 'conflict_events', 'therm
|
||||
// ─── Semantic Hashing for Telegram Posts ─────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Produce a normalized hash of a post's content.
|
||||
* Strips timestamps, normalizes numbers, lowercases — so "BREAKING: 5 missiles at 14:32"
|
||||
* and "Breaking: 7 missiles at 15:01" produce the same hash (both are "missile strike" signals).
|
||||
* Produce a normalized semantic hash of a post's content.
|
||||
* This is intentionally lossy and is only safe as a fallback when a stable
|
||||
* post identity is unavailable.
|
||||
*/
|
||||
function contentHash(text) {
|
||||
if (!text) return '';
|
||||
@@ -83,12 +83,32 @@ function contentHash(text) {
|
||||
return createHash('sha256').update(normalized).digest('hex').substring(0, 12);
|
||||
}
|
||||
|
||||
/**
 * Build a dedup key for a Telegram post, preferring stable identity over content.
 *
 * Resolution order:
 *   1. `id:<channel>:<id>` (or `id:<id>` when no channel is known) when the
 *      post carries `postId` or `messageId`.
 *   2. A 16-hex-char SHA-256 prefix of `channel|date|text[0..200]` when both
 *      channel and date are present.
 *   3. `semantic:<contentHash>` as a lossy last resort.
 *
 * NOTE(review): callers recompute keys from persisted snapshots of earlier
 * runs, so every field read here (postId/messageId, channel/chat, date) must
 * survive persistence unchanged; otherwise the same post keys differently
 * across runs and is re-alerted. Confirm the snapshot keeps `messageId`.
 *
 * @param {object|null|undefined} post - post with optional `postId`,
 *   `messageId`, `channel`, `chat`, `date` and `text` fields
 * @returns {string} dedup key, or '' when the post has neither identity nor
 *   content (callers treat '' as "skip")
 */
function stablePostKey(post) {
  if (!post) return '';

  const sourceId = post.postId || post.messageId || '';
  const channelId = post.channel || post.chat || '';
  const date = post.date || '';
  const text = (post.text || '').trim().substring(0, 200);

  if (sourceId) {
    // Message ids are presumably only unique within a channel, so scope the
    // key by channel whenever one is known to avoid cross-channel collisions.
    return channelId ? `id:${channelId}:${sourceId}` : `id:${sourceId}`;
  }

  if (channelId && date) {
    return createHash('sha256')
      .update(`${channelId}|${date}|${text}`)
      .digest('hex')
      .substring(0, 16);
  }

  // Without identity or content there is nothing to key on. Return '' rather
  // than a bare "semantic:" prefix, which is truthy and would make every such
  // post share one key while slipping past the callers' empty-key guards.
  if (!text) return '';

  return `semantic:${contentHash(post.text)}`;
}
|
||||
|
||||
// ─── Core Delta Computation ──────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* @param {object} current - current sweep's synthesized data
|
||||
* @param {object|null} previous - previous sweep's synthesized data (null on first run)
|
||||
* @param {object} [thresholdOverrides] - optional: { numeric: {...}, count: {...} }
|
||||
* @param {Array<object>} [priorRuns] - optional compacted prior runs for broader dedup
|
||||
*/
|
||||
export function computeDelta(current, previous, thresholdOverrides = {}, priorRuns = []) {
|
||||
if (!previous) return null;
|
||||
@@ -153,14 +173,16 @@ export function computeDelta(current, previous, thresholdOverrides = {}, priorRu
|
||||
// ─── New urgent Telegram posts (semantic dedup) ──────────────────────
|
||||
|
||||
// Dedup against all recent runs (not just the last one) to catch posts that
|
||||
// drop out of one sweep but reappear in a later one.
|
||||
// drop out of one sweep but reappear in a later one. Use stable post identity
|
||||
// where possible so updated posts are not collapsed into earlier alerts just
|
||||
// because their text is semantically similar.
|
||||
const sources = priorRuns.length > 0 ? priorRuns : [previous];
|
||||
const prevHashes = new Set(
|
||||
sources.flatMap(run => (run?.tg?.urgent || []).map(p => contentHash(p.text)))
|
||||
sources.flatMap(run => (run?.tg?.urgent || []).map(stablePostKey)).filter(Boolean)
|
||||
);
|
||||
|
||||
for (const post of (current.tg?.urgent || [])) {
|
||||
const hash = contentHash(post.text);
|
||||
const hash = stablePostKey(post);
|
||||
if (hash && !prevHashes.has(hash)) {
|
||||
signals.new.push({
|
||||
key: `tg_urgent:${hash}`,
|
||||
|
||||
@@ -201,7 +201,15 @@ export class MemoryManager {
|
||||
bls: data.bls,
|
||||
treasury: data.treasury,
|
||||
gscpi: data.gscpi,
|
||||
tg: { posts: data.tg?.posts, urgent: (data.tg?.urgent || []).map(p => ({ text: p.text, date: p.date })) },
|
||||
tg: {
|
||||
posts: data.tg?.posts,
|
||||
urgent: (data.tg?.urgent || []).map(p => ({
|
||||
text: p.text,
|
||||
date: p.date,
|
||||
channel: p.channel || p.chat || null,
|
||||
postId: p.postId || null,
|
||||
})),
|
||||
},
|
||||
thermal: (data.thermal || []).map(t => ({ region: t.region, det: t.det, night: t.night, hc: t.hc })),
|
||||
air: (data.air || []).map(a => ({ region: a.region, total: a.total })),
|
||||
nuke: (data.nuke || []).map(n => ({ site: n.site, anom: n.anom, cpm: n.cpm })),
|
||||
|
||||
Reference in New Issue
Block a user