intelligence-terminal/apis/sources/telegram.mjs

// Telegram — public channel intelligence from conflict zones and OSINT analysts
// Primary mode: Bot API with TELEGRAM_BOT_TOKEN (getUpdates, getChat)
// Fallback mode: Scrape public channel web previews at https://t.me/s/{channel}
// Monitors conflict zones (Ukraine, Middle East), geopolitics, and OSINT channels.

import { safeFetch } from '../utils/fetch.mjs';
import '../utils/env.mjs';

function delay(ms) { return new Promise(r => setTimeout(r, ms)); }

// Curated list of well-known public OSINT / conflict / geopolitics channels
// All verified to have public web previews enabled at https://t.me/s/{id}
// Override with TELEGRAM_CHANNELS env var (comma-separated channel IDs)
const DEFAULT_CHANNELS = [
  // === Conflict: Ukraine/Russia ===
  { id: 'intelslava',        label: 'Intel Slava Z',       topic: 'conflict',    note: 'Conflict updates, pro-Russian perspective' },
  { id: 'legitimniy',        label: 'Legitimniy',          topic: 'conflict',    note: 'Ukrainian politics & conflict analysis' },
  { id: 'wartranslated',     label: 'War Translated',      topic: 'conflict',    note: 'Conflict translations & OSINT' },
  { id: 'ukraine_frontline', label: 'Ukraine Frontline',   topic: 'conflict',    note: 'Frontline situation updates' },
  { id: 'mod_russia',        label: 'Russian MoD',         topic: 'conflict',    note: 'Russian Ministry of Defense official' },
  { id: 'CIG_telegram',      label: 'Conflict Intel Team', topic: 'osint',       note: 'Conflict Intelligence Team analysis' },
  { id: 'RVvoenkor',         label: 'Voenkor RV',          topic: 'conflict',    note: 'Russian military correspondent' },
  { id: 'readovkanews',      label: 'Readovka',            topic: 'conflict',    note: 'Russian conflict news aggregator' },
  { id: 'DeepStateUA',       label: 'DeepState Ukraine',   topic: 'conflict',    note: 'Ukrainian frontline maps & analysis' },
  { id: 'operativnoZSU',     label: 'ZSU Operative',       topic: 'conflict',    note: 'Ukrainian armed forces updates' },
  { id: 'GeneralStaffZSU',   label: 'General Staff ZSU',   topic: 'conflict',    note: 'Ukrainian General Staff official' },
  // === Middle East ===
  { id: 'middleeastosint',   label: 'Middle East OSINT',   topic: 'osint',       note: 'Middle East open source intel' },
  { id: 'inikiforv',         label: 'Nikiforov OSINT',     topic: 'osint',       note: 'Cross-regional OSINT analyst' },
  // === Geopolitics & Analysis ===
  { id: 'geaborning',        label: 'Geo A. Borning',      topic: 'geopolitics', note: 'Geopolitical analysis and forecasting' },
  { id: 'TheIntelligencer',  label: 'The Intelligencer',   topic: 'osint',       note: 'Intelligence community news' },
  // === Markets & Finance ===
  { id: 'WallStreetSilver',  label: 'Wall St Silver',      topic: 'finance',     note: 'Commodities and macro commentary' },
  { id: 'unusual_whales',    label: 'Unusual Whales',      topic: 'finance',     note: 'Market flow and options analysis' },
];

// Allow user to add custom channels via env var
function loadChannels() {
  const custom = process.env.TELEGRAM_CHANNELS;
  if (!custom) return DEFAULT_CHANNELS;

  const customIds = custom.split(',').map(s => s.trim()).filter(Boolean);
  const existing = new Set(DEFAULT_CHANNELS.map(c => c.id));

  const extras = customIds
    .filter(id => !existing.has(id))
    .map(id => ({ id, label: id, topic: 'custom', note: 'User-added channel' }));

  return [...DEFAULT_CHANNELS, ...extras];
}

const CHANNELS = loadChannels();

// Urgent keywords that flag high-priority posts
// Organized by domain for maintainability
const URGENT_KEYWORDS = [
  // Breaking / meta urgency
  'breaking', 'urgent', 'alert', 'confirmed', 'just in', 'flash',
  // Military / kinetic
  'missile', 'strike', 'explosion', 'airstrike', 'drone', 'bombardment',
  'shelling', 'intercept', 'ICBM', 'hypersonic', 'F-16', 'ATACMS', 'HIMARS',
  // Escalation / de-escalation
  'nuclear', 'chemical', 'biological', 'ceasefire', 'escalation', 'invasion',
  'offensive', 'retreat', 'advance', 'mobilization', 'martial law',
  // Geopolitical
  'nato', 'coup', 'assassination', 'sanctions', 'embargo', 'blockade',
  'summit', 'ultimatum', 'declaration of war', 'peace deal',
  // Casualty / humanitarian
  'casualties', 'killed', 'wounded', 'evacuation', 'refugee', 'humanitarian',
  // Infrastructure / cyber
  'blackout', 'sabotage', 'cyberattack', 'pipeline', 'dam', 'nuclear plant',
  // Financial crisis
  'default', 'bank run', 'circuit breaker', 'flash crash', 'emergency rate',
];

// ─── Bot API mode ───────────────────────────────────────────────────────────

const botBase = () => `https://api.telegram.org/bot${process.env.TELEGRAM_BOT_TOKEN}`;

// Get recent updates the bot has received
export async function getUpdates(opts = {}) {
  const { limit = 100, offset = 0 } = opts;
  const params = new URLSearchParams({ limit: String(limit), offset: String(offset) });
  return safeFetch(`${botBase()}/getUpdates?${params}`);
}

// Get info about a chat/channel by username
export async function getChat(chatId) {
  const params = new URLSearchParams({ chat_id: chatId.startsWith('@') ? chatId : `@${chatId}` });
  return safeFetch(`${botBase()}/getChat?${params}`);
}

// Compact a Bot API message for briefing output
function compactBotMessage(msg) {
  return {
    text: msg.text || msg.caption || '',
    date: msg.date ? new Date(msg.date * 1000).toISOString() : null,
    chat: msg.chat?.title || msg.chat?.username || 'unknown',
    views: msg.views || 0,
    hasMedia: !!(msg.photo || msg.video || msg.document),
  };
}

// Fetch updates via Bot API and organize by channel
async function fetchBotUpdates() {
  const result = await getUpdates({ limit: 100 });
  if (!result?.ok || !Array.isArray(result.result)) {
    return { error: result?.description || 'Bot API request failed' };
  }

  const messages = result.result
    .map(u => u.message || u.channel_post || u.edited_channel_post)
    .filter(Boolean)
    .map(compactBotMessage);

  return { messages, count: messages.length };
}

// ─── Web preview scraping fallback ──────────────────────────────────────────

// Fetch raw HTML from a URL (safeFetch truncates non-JSON to 500 chars, too short)
async function fetchHTML(url, timeoutMs = 15000) {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const res = await fetch(url, {
      signal: controller.signal,
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
      },
    });
    clearTimeout(timer);
    if (!res.ok) throw new Error(`HTTP ${res.status}`);
    return await res.text();
  } catch (e) {
    clearTimeout(timer);
    return null;
  }
}

// Parse messages from Telegram web preview HTML (https://t.me/s/channel)
// The HTML contains <div class="tgme_widget_message_wrap"> blocks with message content.
function parseWebPreview(html, channelId) {
  if (!html) return [];

  const messages = [];

  // Each message sits inside a tgme_widget_message_wrap div
  // We extract using the data-post attribute which has the format "channel/msgId"
  const msgBlockRegex = /class="tgme_widget_message_wrap[^"]*"[\s\S]*?data-post="([^"]*)"([\s\S]*?)(?=class="tgme_widget_message_wrap|$)/gi;
  // Simpler: split on message boundaries using data-post
  const postRegex = /data-post="([^"]+)"([\s\S]*?)(?=data-post="|$)/gi;

  let match;
  while ((match = postRegex.exec(html)) !== null && messages.length < 20) {
    const postId = match[1]; // e.g. "intelslava/12345"
    const block = match[2];

    // Extract message text from tgme_widget_message_text
    const textMatch = block.match(/class="tgme_widget_message_text[^"]*"[^>]*>([\s\S]*?)<\/div>/i);
    let text = '';
    if (textMatch) {
      text = textMatch[1]
        .replace(/<br\s*\/?>/gi, '\n')     // preserve line breaks
        .replace(/<[^>]+>/g, '')            // strip HTML tags
        .replace(/&amp;/g, '&')
        .replace(/&lt;/g, '<')
        .replace(/&gt;/g, '>')
        .replace(/&quot;/g, '"')
        .replace(/&#039;/g, "'")
        .replace(/&nbsp;/g, ' ')
        .trim();
    }

    // Extract view count
    const viewsMatch = block.match(/class="tgme_widget_message_views"[^>]*>([\s\S]*?)<\/span>/i);
    let views = 0;
    if (viewsMatch) {
      const raw = viewsMatch[1].trim();
      if (raw.endsWith('K')) views = parseFloat(raw) * 1000;
      else if (raw.endsWith('M')) views = parseFloat(raw) * 1000000;
      else views = parseInt(raw, 10) || 0;
    }

    // Extract datetime
    const timeMatch = block.match(/datetime="([^"]+)"/i);
    const date = timeMatch ? timeMatch[1] : null;

    // Check for media (photos, videos)
    const hasMedia = /tgme_widget_message_photo|tgme_widget_message_video/i.test(block);

    if (text || hasMedia) {
      messages.push({
        postId,
        text,
        date,
        views,
        hasMedia,
        channel: channelId,
      });
    }
  }

  return messages;
}

// Scrape a single channel's web preview
async function scrapeChannel(channelId) {
  const url = `https://t.me/s/${channelId}`;
  const html = await fetchHTML(url);
  if (!html) return { channel: channelId, error: 'Failed to fetch', posts: [] };

  // Extract channel title from page
  const titleMatch = html.match(/class="tgme_channel_info_header_title[^"]*"[^>]*>([\s\S]*?)<\/span>/i)
    || html.match(/<title>(.*?)<\/title>/i);
  const title = titleMatch
    ? titleMatch[1].replace(/<[^>]+>/g, '').trim()
    : channelId;

  const posts = parseWebPreview(html, channelId);

  return { channel: channelId, title, posts, postCount: posts.length };
}

// ─── Analysis helpers ───────────────────────────────────────────────────────

// Flag urgent/high-priority posts
function flagUrgent(post) {
  const lower = (post.text || '').toLowerCase();
  const matched = URGENT_KEYWORDS.filter(k => lower.includes(k));
  return matched.length > 0 ? matched : null;
}

// Score a post's significance (views + urgency + length)
function significanceScore(post) {
  let score = 0;
  score += Math.min(post.views / 1000, 50);              // views weight (capped)
  const urgentFlags = flagUrgent(post);
  if (urgentFlags) score += urgentFlags.length * 10;       // urgency weight
  if (post.text?.length > 100) score += 5;                 // substantive text bonus
  if (post.hasMedia) score += 3;                           // media bonus
  return score;
}

// Group posts by topic based on the channel config
function groupByTopic(allPosts, channelMeta) {
  const groups = {};
  for (const post of allPosts) {
    const meta = channelMeta.find(c => c.id === post.channel);
    const topic = meta?.topic || 'other';
    if (!groups[topic]) groups[topic] = [];
    groups[topic].push(post);
  }
  return groups;
}

// ─── Briefing ───────────────────────────────────────────────────────────────

export async function briefing() {
  const token = process.env.TELEGRAM_BOT_TOKEN;

  // Try Bot API first if token is available
  if (token) {
    try {
      const botData = await fetchBotUpdates();
      if (!botData.error && botData.count > 0) {
        const enriched = botData.messages.map(m => ({
          ...m,
          urgentFlags: flagUrgent(m),
          score: significanceScore(m),
        }));

        const urgent = enriched.filter(m => m.urgentFlags).sort((a, b) => b.score - a.score);
        const top = enriched.sort((a, b) => b.score - a.score).slice(0, 15);

        return {
          source: 'Telegram',
          timestamp: new Date().toISOString(),
          status: 'bot_api',
          totalMessages: botData.count,
          urgentPosts: urgent.slice(0, 10),
          topPosts: top,
          note: 'Data from Bot API getUpdates. Bot must be added to channels to receive posts.',
        };
      }
      // If bot returned no messages, fall through to web scraping
    } catch { /* fall through to scraping */ }
  }

  // Fallback: scrape public channel web previews (no auth needed)
  const results = [];
  const errors = [];

  // Fetch channels in batches of 3 to avoid rate limiting
  for (let i = 0; i < CHANNELS.length; i += 3) {
    const batch = CHANNELS.slice(i, i + 3);
    const batchResults = await Promise.all(
      batch.map(ch => scrapeChannel(ch.id))
    );
    results.push(...batchResults);

    // Delay between batches to be respectful
    if (i + 3 < CHANNELS.length) await delay(1500);
  }

  // Collect all posts and separate errors
  const allPosts = [];
  const channelSummaries = [];

  for (const r of results) {
    const meta = CHANNELS.find(c => c.id === r.channel);
    if (r.error) {
      errors.push({ channel: r.channel, error: r.error });
    }
    // Enrich posts with urgency flags and scores
    const enriched = (r.posts || []).map(p => ({
      ...p,
      urgentFlags: flagUrgent(p),
      score: significanceScore(p),
    }));
    allPosts.push(...enriched);

    channelSummaries.push({
      channel: r.channel,
      title: r.title || meta?.label || r.channel,
      topic: meta?.topic || 'other',
      postCount: r.postCount || 0,
      reachable: !r.error,
    });
  }

  // Sort all posts by significance
  allPosts.sort((a, b) => b.score - a.score);

  // Separate urgent posts
  const urgentPosts = allPosts.filter(p => p.urgentFlags).slice(0, 15);

  // Group by topic
  const byTopic = groupByTopic(allPosts, CHANNELS);
  const topicSummary = {};
  for (const [topic, posts] of Object.entries(byTopic)) {
    topicSummary[topic] = {
      totalPosts: posts.length,
      urgentCount: posts.filter(p => p.urgentFlags).length,
      topPosts: posts.sort((a, b) => b.score - a.score).slice(0, 5),
    };
  }

  return {
    source: 'Telegram',
    timestamp: new Date().toISOString(),
    status: token ? 'bot_api_empty_fallback_scrape' : 'web_scrape',
    method: 'Public channel web preview scraping (no auth required)',
    channelsMonitored: channelSummaries.length,
    channelsReachable: channelSummaries.filter(c => c.reachable).length,
    totalPosts: allPosts.length,
    urgentPosts,
    byTopic: topicSummary,
    channels: channelSummaries,
    errors: errors.length > 0 ? errors : undefined,
    topPosts: allPosts.slice(0, 15),
    hint: token
      ? undefined
      : 'Set TELEGRAM_BOT_TOKEN in .env for Bot API access. Create a bot via @BotFather on Telegram.',
  };
}

// ─── CLI runner ─────────────────────────────────────────────────────────────

if (process.argv[1]?.endsWith('telegram.mjs')) {
  console.log('Telegram OSINT — fetching public channel intelligence...\n');
  const data = await briefing();
  console.log(JSON.stringify(data, null, 2));
}