/**
 * Minimal RSS/Atom extraction — enough for EDGAR atom feeds and PR-wire RSS.
 * Deliberately dependency-free; if a feed outgrows this, swap in
 * fast-xml-parser without touching the pollers' output shape.
 *
 * This is regex-based extraction, not a real XML parser: nested elements of
 * the same name, comments, and processing instructions are not understood.
 */
export class RssParser {
  /**
   * The five predefined XML entities. A Map (rather than an object literal)
   * so that inherited keys such as `constructor` can never be looked up.
   */
  private static readonly NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"],
  ]);

  /**
   * Extract raw `<item>…</item>` or `<entry>…</entry>` blocks.
   *
   * @param xml Full feed document.
   * @param tag Element name to extract (case-sensitive).
   * @returns Each matching element, including its own opening and closing
   *   tags, in document order; an empty array when none are found.
   */
  static blocks(xml: string, tag: 'item' | 'entry'): string[] {
    // `[\s>]` after the name stops `<item` from matching e.g. `<items>`.
    const re = new RegExp(`<${tag}[\\s>][\\s\\S]*?<\\/${tag}>`, 'g');
    return xml.match(re) ?? [];
  }

  /**
   * First occurrence of a simple tag's text content, entity-decoded.
   *
   * @param block XML fragment to search (typically one result of `blocks`).
   * @param name Element name, matched case-insensitively; may contain a
   *   namespace prefix such as `dc:creator`.
   * @returns The cleaned text (CDATA unwrapped, markup removed, entities
   *   decoded, whitespace collapsed), or `null` when the element is absent.
   */
  static tag(block: string, name: string): string | null {
    // The name is interpolated into a pattern, so neutralise metacharacters.
    const safe = name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // The opening tag must be exactly `<name>` or `<name attrs>`:
    //  - the name must be followed by whitespace or `>`, so `id` does not
    //    match `<identifier>`;
    //  - the attribute run may not end in `/`, so a self-closing `<name/>`
    //    is not mistaken for an opening tag.
    const re = new RegExp(
      `<${safe}(?:\\s(?:[^>]*[^/>])?)?>([\\s\\S]*?)<\\/${safe}\\s*>`,
      'i',
    );
    const inner = re.exec(block)?.[1];
    return inner === undefined ? null : RssParser.clean(inner);
  }

  /**
   * Atom-style `<link href="…"/>` (self-closing) or RSS `<link>…</link>`.
   *
   * The `href` attribute of the first `<link>` element that has one wins;
   * otherwise the text content of a `<link>` element is used.
   *
   * @param block XML fragment for a single item/entry.
   * @returns The entity-decoded URL, or `null` when no link is present.
   */
  static link(block: string): string | null {
    // Anchored to `<link` so an `<a href="…">` inside a description is ignored.
    const m = /<link\s(?:[^>]*?\s)?href\s*=\s*(?:"([^"]+)"|'([^']+)')/i.exec(block);
    const href = m?.[1] ?? m?.[2];
    if (href !== undefined) {
      return RssParser.decode(href.trim());
    }
    return RssParser.tag(block, 'link');
  }

  /**
   * Reduce an element's inner XML to plain text: unwrap CDATA sections, drop
   * any markup, decode entities, then collapse runs of whitespace.
   */
  private static clean(raw: string): string {
    const noCdata = raw.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, '$1');
    // Tags become a space so adjacent words do not fuse.
    const noTags = noCdata.replace(/<[^>]+>/g, ' ');
    return RssParser.decode(noTags).replace(/\s+/g, ' ').trim();
  }

  /**
   * Decode the predefined XML entities plus decimal (`&#39;`) and hexadecimal
   * (`&#x27;`) character references.
   *
   * Decoding happens in a single pass so already-decoded output is never
   * re-scanned (`&amp;lt;` becomes `&lt;`, not `<`). Unknown named entities
   * and out-of-range numeric references are left untouched.
   */
  private static decode(s: string): string {
    return s.replace(
      /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([A-Za-z][A-Za-z0-9]*));/g,
      (whole: string, dec?: string, hex?: string, named?: string): string => {
        if (named !== undefined) {
          return RssParser.NAMED_ENTITIES.get(named) ?? whole;
        }
        const codePoint =
          dec !== undefined ? Number.parseInt(dec, 10) : Number.parseInt(hex ?? '', 16);
        // fromCodePoint throws outside the Unicode range; keep the text as-is.
        if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff) {
          return whole;
        }
        // fromCodePoint (not fromCharCode) so astral characters survive.
        return String.fromCodePoint(codePoint);
      },
    );
  }
}