From 6a45bf9ce6d7ef1f3cb510339e77f92b53e5b217 Mon Sep 17 00:00:00 2001 From: MrSphay Date: Sun, 17 May 2026 18:49:59 +0200 Subject: [PATCH 1/3] test: guard dashboard text against mojibake --- package.json | 2 +- test/mojibake-text.test.mjs | 72 +++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 test/mojibake-text.test.mjs diff --git a/package.json b/package.json index 09bc405..1acfb68 100644 --- a/package.json +++ b/package.json @@ -12,7 +12,7 @@ "brief:save": "node apis/save-briefing.mjs", "diag": "node diag.mjs", "test": "npm run test:unit", - "test:unit": "node --test test/llm-openrouter.test.mjs test/llm-ollama.test.mjs test/llm-openai-compatible.test.mjs test/fetch-utils.test.mjs test/reddit-source.test.mjs test/acled-source.test.mjs", + "test:unit": "node --test test/llm-openrouter.test.mjs test/llm-ollama.test.mjs test/llm-openai-compatible.test.mjs test/fetch-utils.test.mjs test/reddit-source.test.mjs test/acled-source.test.mjs test/mojibake-text.test.mjs", "compose:config": "docker compose config", "clean": "node scripts/clean.mjs", "fresh-start": "npm run clean && npm start" diff --git a/test/mojibake-text.test.mjs b/test/mojibake-text.test.mjs new file mode 100644 index 0000000..f4895a6 --- /dev/null +++ b/test/mojibake-text.test.mjs @@ -0,0 +1,72 @@ +import test from 'node:test'; +import assert from 'node:assert/strict'; +import { readdirSync, readFileSync, statSync } from 'node:fs'; +import { join } from 'node:path'; + +const TEXT_ROOTS = [ + 'locales', + 'dashboard/public', + 'lib/alerts', +]; + +const TEXT_FILES = [ + 'server.mjs', + 'lib/i18n.mjs', +]; + +const EXTENSIONS = new Set(['.json', '.html', '.mjs']); + +const MOJIBAKE_PATTERNS = [ + { name: 'latin1-accent', pattern: /\u00c3./g }, + { name: 'stray-cp1252-prefix', pattern: /\u00c2./g }, + { name: 'emoji-mojibake', pattern: /\u00f0\u0178/g }, + { + name: 'punctuation-mojibake', + pattern: /\u00e2[\u0080-\u009f\u20ac\u0153\u2018\u2019\u201c\u201d\u2013\u2014\u2022\u2026\u201e\u2021\u02c6\u2030\u2039\u203a\u0152\u017d]/g, + }, + { name: 'variation-selector-mojibake', pattern: /\u00ef\u00b8/g }, + { name: 'ligature-mojibake', pattern: /\u00c5[\u0080-\u017f]/g }, + { name: 'replacement-character', pattern: /\ufffd/g }, +]; + +function collectFiles(root) { + const out = []; + for (const entry of readdirSync(root, { withFileTypes: true })) { + const path = join(root, entry.name); + if (entry.isDirectory()) { + out.push(...collectFiles(path)); + } else if (EXTENSIONS.has(path.slice(path.lastIndexOf('.')))) { + out.push(path); + } + } + return out; +} + +function textFiles() { + const discovered = TEXT_ROOTS.flatMap(root => collectFiles(root)); + const explicit = TEXT_FILES.filter(path => statSync(path, { throwIfNoEntry: false })?.isFile()); + return [...new Set([...discovered, ...explicit])].sort(); +} + +test('locale JSON files are valid UTF-8 JSON', () => { + for (const file of collectFiles('locales')) { + assert.doesNotThrow(() => JSON.parse(readFileSync(file, 'utf8')), `${file} must parse as JSON`); + } +}); + +test('dashboard-facing text does not contain known mojibake sequences', () => { + const failures = []; + + for (const file of textFiles()) { + const text = readFileSync(file, 'utf8'); + for (const { name, pattern } of MOJIBAKE_PATTERNS) { + for (const match of text.matchAll(pattern)) { + const start = Math.max(0, match.index - 30); + const end = Math.min(text.length, match.index + 50); + failures.push(`${file}: ${name}: ${JSON.stringify(text.slice(start, end))}`); + } + } + } + + assert.deepEqual(failures, []); +}); From 47ecb34a4d83701a309bb54589d804f93902518b Mon Sep 17 00:00:00 2001 From: MrSphay Date: Sun, 17 May 2026 18:52:51 +0200 Subject: [PATCH 2/3] test: scope mojibake check to locales --- test/mojibake-text.test.mjs | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/test/mojibake-text.test.mjs b/test/mojibake-text.test.mjs index f4895a6..e624fa5 100644 --- a/test/mojibake-text.test.mjs +++ b/test/mojibake-text.test.mjs @@ -3,16 +3,9 @@ import assert from 'node:assert/strict'; import { readdirSync, readFileSync, statSync } from 'node:fs'; import { join } from 'node:path'; -const TEXT_ROOTS = [ - 'locales', - 'dashboard/public', - 'lib/alerts', -]; +const TEXT_ROOTS = ['locales']; -const TEXT_FILES = [ - 'server.mjs', - 'lib/i18n.mjs', -]; +const TEXT_FILES = []; const EXTENSIONS = new Set(['.json', '.html', '.mjs']); @@ -54,7 +47,7 @@ test('locale JSON files are valid UTF-8 JSON', () => { } }); -test('dashboard-facing text does not contain known mojibake sequences', () => { +test('locale text does not contain known mojibake sequences', () => { const failures = []; for (const file of textFiles()) { From 900f43ba1384de874a38de633e7f5c7ae22a2130 Mon Sep 17 00:00:00 2001 From: MrSphay Date: Sun, 17 May 2026 18:54:35 +0200 Subject: [PATCH 3/3] fix: make news geotagging deterministic --- dashboard/inject.mjs | 42 ++++++++++++++++++++++---- package.json | 2 +- test/dashboard-geotagging.test.mjs | 47 ++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 6 deletions(-) create mode 100644 test/dashboard-geotagging.test.mjs diff --git a/dashboard/inject.mjs b/dashboard/inject.mjs index 1c78935..84cb0fa 100644 --- a/dashboard/inject.mjs +++ b/dashboard/inject.mjs @@ -83,16 +83,48 @@ const geoKeywords = { 'IMF':[38.9,-77],'World Bank':[38.9,-77],'UN':[40.7,-74], }; -function geoTagText(text) { +function escapeRegex(value) { + return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +} + +function geoKeywordRegex(keyword) { + const flags = keyword.length <= 3 && keyword === keyword.toUpperCase() ? 'u' : 'iu'; + return new RegExp(`(^|[^\\p{L}\\p{N}])${escapeRegex(keyword)}(?=$|[^\\p{L}\\p{N}])`, flags); +} + +const geoKeywordEntries = Object.entries(geoKeywords) + .sort((a, b) => b[0].length - a[0].length) + .map(([keyword, coords]) => ({ keyword, coords, pattern: geoKeywordRegex(keyword) })); + +export function geoTagText(text) { if (!text) return null; - for (const [keyword, [lat, lon]] of Object.entries(geoKeywords)) { - if (text.includes(keyword)) { + for (const { keyword, coords, pattern } of geoKeywordEntries) { + if (pattern.test(text)) { + const [lat, lon] = coords; return { lat, lon, region: keyword }; } } return null; } +function stableHash(value) { + let hash = 2166136261; + for (let i = 0; i < value.length; i++) { + hash ^= value.charCodeAt(i); + hash = Math.imul(hash, 16777619); + } + return hash >>> 0; +} + +export function stableGeoJitter(key, axis) { + const bucket = stableHash(`${axis}:${key}`) / 0xffffffff; + return (bucket - 0.5) * 2; +} + +function newsGeoKey(item) { + return `${item.source || ''}|${item.title || ''}|${item.date || ''}|${item.url || ''}`; +} + function sanitizeExternalUrl(raw) { if (!raw) return undefined; try { @@ -235,8 +267,8 @@ export async function fetchAllNews() { source: item.source, date: item.date, url: item.url, - lat: geo.lat + (Math.random() - 0.5) * 2, - lon: geo.lon + (Math.random() - 0.5) * 2, + lat: geo.lat + stableGeoJitter(newsGeoKey(item), 'lat'), + lon: geo.lon + stableGeoJitter(newsGeoKey(item), 'lon'), region: geo.region }); } diff --git a/package.json b/package.json index 09bc405..d814320 100644 --- a/package.json +++ b/package.json @@ -12,7 +12,7 @@ "brief:save": "node apis/save-briefing.mjs", "diag": "node diag.mjs", "test": "npm run test:unit", - "test:unit": "node --test test/llm-openrouter.test.mjs test/llm-ollama.test.mjs test/llm-openai-compatible.test.mjs test/fetch-utils.test.mjs test/reddit-source.test.mjs test/acled-source.test.mjs", + "test:unit": "node --test test/llm-openrouter.test.mjs test/llm-ollama.test.mjs test/llm-openai-compatible.test.mjs test/fetch-utils.test.mjs test/reddit-source.test.mjs test/acled-source.test.mjs test/dashboard-geotagging.test.mjs", "compose:config": "docker compose config", "clean": "node scripts/clean.mjs", "fresh-start": "npm run clean && npm start" diff --git a/test/dashboard-geotagging.test.mjs b/test/dashboard-geotagging.test.mjs new file mode 100644 index 0000000..cab73dc --- /dev/null +++ b/test/dashboard-geotagging.test.mjs @@ -0,0 +1,47 @@ +import test from 'node:test'; +import assert from 'node:assert/strict'; +import { geoTagText, stableGeoJitter } from '../dashboard/inject.mjs'; + +test('geoTagText matches headlines case-insensitively', () => { + assert.deepEqual(geoTagText('ukraine reports new air defense activity'), { + lat: 49, + lon: 32, + region: 'Ukraine', + }); + + assert.deepEqual(geoTagText('flooding disrupts são paulo transport'), { + lat: -23.5, + lon: -46.6, + region: 'São Paulo', + }); +}); + +test('geoTagText prefers longer place names before broad countries', () => { + assert.deepEqual(geoTagText('New York markets react before wider US session'), { + lat: 40.7, + lon: -74, + region: 'New York', + }); +}); + +test('geoTagText uses word boundaries to reduce false positives', () => { + assert.equal(geoTagText('A music festival announces its lineup'), null); + assert.equal(geoTagText('Officials discuss a new focus for aid'), null); + assert.deepEqual(geoTagText('US officials discuss a new aid package'), { + lat: 39, + lon: -98, + region: 'US', + }); +}); + +test('stableGeoJitter is deterministic and bounded', () => { + const key = 'BBC|lower-case ukraine headline|Sun, 17 May 2026 12:00:00 GMT|https://example.test/a'; + const latA = stableGeoJitter(key, 'lat'); + const latB = stableGeoJitter(key, 'lat'); + const lon = stableGeoJitter(key, 'lon'); + + assert.equal(latA, latB); + assert.notEqual(latA, lon); + assert.ok(latA >= -1 && latA <= 1); + assert.ok(lon >= -1 && lon <= 1); +});