Files
2026-06-11 19:18:19 -04:00

44 lines
1.6 KiB
TypeScript

/**
* Minimal RSS/Atom extraction — enough for EDGAR atom feeds and PR-wire RSS.
* Deliberately dependency-free; if a feed outgrows this, swap in
* fast-xml-parser without touching the pollers' output shape.
*/
export class RssParser {
  /** The five entities predefined by XML, keyed by name (without `&` and `;`). */
  private static readonly NAMED_ENTITIES: Readonly<Record<string, string>> = {
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'",
  };

  /**
   * Extract raw `<item>…</item>` or `<entry>…</entry>` blocks.
   *
   * @param xml - Feed document text.
   * @param tag - Element name to collect.
   * @returns Every non-overlapping match in document order, including the
   *   enclosing tags; an empty array when there is none.
   */
  static blocks(xml: string, tag: 'item' | 'entry'): string[] {
    // `[\s>]` after the name keeps `<entry` from matching `<entryfoo>`.
    const re = new RegExp(`<${tag}[\\s>][\\s\\S]*?<\\/${tag}>`, 'g');
    return xml.match(re) ?? [];
  }

  /**
   * First occurrence of a simple tag's text content, entity-decoded.
   *
   * @param block - XML fragment to search (typically one item/entry block).
   * @param name - Element name, matched case-insensitively. It is treated as
   *   literal text, so names such as `dc:creator` are safe.
   * @returns The cleaned text content, or `null` when no
   *   `<name …>…</name>` pair is found.
   */
  static tag(block: string, name: string): string | null {
    const escaped = name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // `(?=[\s>])` stops `title` from matching `<titles>`; `(?<!\/)` skips
    // self-closing `<name … />`, which has no content to return.
    const re = new RegExp(
      `<${escaped}(?=[\\s>])[^>]*(?<!\\/)>([\\s\\S]*?)<\\/${escaped}\\s*>`,
      'i',
    );
    const inner = re.exec(block)?.[1];
    return inner === undefined ? null : RssParser.clean(inner);
  }

  /**
   * Atom-style `<link href="…"/>` (self-closing) or RSS `<link>…</link>`.
   *
   * @param block - XML fragment to search.
   * @returns The first non-empty `href` attribute (double- or single-quoted)
   *   found on a `<link>` element, trimmed and entity-decoded; otherwise the
   *   text content of the first `<link>…</link>`; otherwise `null`.
   */
  static link(block: string): string | null {
    // `\shref` requires a real attribute boundary so `xlink:href` or
    // `data-href` are not mistaken for `href`.
    const m = /<link(?=[\s/>])[^>]*?\shref\s*=\s*(?:"([^"]+)"|'([^']+)')/i.exec(block);
    const href = m?.[1] ?? m?.[2];
    if (href !== undefined) return RssParser.decode(href.trim());
    return RssParser.tag(block, 'link');
  }

  /**
   * Reduce element content to plain text: unwrap CDATA sections, replace any
   * markup with a space, decode entities, then collapse runs of whitespace.
   *
   * Markup is stripped before entities are decoded, so escaped markup such as
   * `&lt;b&gt;` survives as literal text in the result.
   */
  private static clean(raw: string): string {
    const noCdata = raw.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, '$1');
    const noTags = noCdata.replace(/<[^>]+>/g, ' ');
    return RssParser.decode(noTags).replace(/\s+/g, ' ').trim();
  }

  /**
   * Decode the predefined XML entities plus decimal (`&#39;`) and hexadecimal
   * (`&#x27;`) character references.
   *
   * Decoding is a single pass, so each reference is expanded exactly once:
   * `&amp;lt;` yields the text `&lt;`, not `<`. References that are not valid
   * Unicode code points are left untouched rather than throwing.
   */
  private static decode(s: string): string {
    return s.replace(
      /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot|apos));/g,
      (entity: string, dec?: string, hex?: string, named?: string): string => {
        if (named !== undefined) return RssParser.NAMED_ENTITIES[named] ?? entity;
        const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? '', 16);
        const valid = Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff;
        // fromCodePoint (unlike fromCharCode) handles astral code points.
        return valid ? String.fromCodePoint(codePoint) : entity;
      },
    );
  }
}