44 lines
1.6 KiB
TypeScript
44 lines
1.6 KiB
TypeScript
/**
 * Minimal RSS/Atom extraction — enough for EDGAR atom feeds and PR-wire RSS.
 * Deliberately dependency-free; if a feed outgrows this, swap in
 * fast-xml-parser without touching the pollers' output shape.
 */
|
|
export class RssParser {
  /** The five entities predefined by XML (amp, lt, gt, quot, apos), keyed by name. */
  private static readonly NAMED_ENTITIES: Readonly<Record<string, string>> = {
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'",
  };

  /**
   * Extract raw `<item>…</item>` or `<entry>…</entry>` blocks.
   *
   * @param xml - Full feed document text.
   * @param tag - Element name to extract; `item` for RSS, `entry` for Atom.
   * @returns Every non-overlapping block in document order (opening and
   *   closing tags included), or an empty array when none is found.
   */
  static blocks(xml: string, tag: 'item' | 'entry'): string[] {
    // `[\s>]` after the name keeps `<item` from matching `<items>`; the lazy
    // body stops at the first closing tag so adjacent blocks stay separate.
    const re = new RegExp(`<${tag}[\\s>][\\s\\S]*?<\\/${tag}>`, 'g');
    return xml.match(re) ?? [];
  }

  /**
   * First occurrence of a simple tag's text content, entity-decoded.
   *
   * CDATA wrappers are removed, nested markup is stripped and whitespace is
   * collapsed before the text is returned.
   *
   * @param block - XML fragment to search (typically one item/entry block).
   * @param name - Element name, matched case-insensitively. It is interpolated
   *   into a regular expression verbatim, so it must not contain unintended
   *   regex metacharacters.
   * @returns The cleaned text, or `null` when the element is absent.
   */
  static tag(block: string, name: string): string | null {
    // Require whitespace or `>` right after the name so that asking for `id`
    // does not start matching at `<identifier>`.
    const re = new RegExp(
      `<${name}(?:\\s[^>]*)?>([\\s\\S]*?)<\\/${name}\\s*>`,
      'i',
    );
    const inner = block.match(re)?.[1];
    return inner === undefined ? null : RssParser.clean(inner);
  }

  /**
   * Atom-style `<link href="…"/>` (self-closing) or RSS `<link>…</link>`.
   *
   * @param block - XML fragment to search (typically one item/entry block).
   * @returns The entity-decoded `href` of the first `<link>` element that has
   *   one; otherwise the text content of the first `<link>` element; `null`
   *   when neither exists.
   */
  static link(block: string): string | null {
    // `href` must be a whole attribute of a real `<link` element (not
    // `data-href`, not `<linkfoo>`); both quote styles are accepted.
    const m = block.match(
      /<link\s(?:[^>]*?\s)?href\s*=\s*(?:"([^"]+)"|'([^']+)')/i,
    );
    const href = m?.[1] ?? m?.[2];
    if (href !== undefined) return RssParser.decode(href.trim());
    return RssParser.tag(block, 'link');
  }

  /**
   * Turn raw element content into plain text: unwrap CDATA sections, replace
   * markup tags with a space, decode entities, then collapse whitespace runs.
   */
  private static clean(raw: string): string {
    const noCdata = raw.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, '$1');
    const noTags = noCdata.replace(/<[^>]+>/g, ' ');
    return RssParser.decode(noTags).replace(/\s+/g, ' ').trim();
  }

  /**
   * Decode the predefined XML entities plus decimal and hexadecimal numeric
   * character references.
   *
   * Decoding happens in a single pass, so text produced by one replacement is
   * never decoded a second time (an escaped ampersand followed by `lt;` yields
   * the literal entity text, not `<`). Unknown entities and out-of-range
   * numeric references are left untouched.
   */
  private static decode(s: string): string {
    return s.replace(
      /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot|apos));/g,
      (whole: string, dec?: string, hex?: string, named?: string): string => {
        if (named !== undefined) {
          return RssParser.NAMED_ENTITIES[named] ?? whole;
        }
        const code =
          dec !== undefined
            ? Number.parseInt(dec, 10)
            : Number.parseInt(hex ?? '', 16);
        // fromCodePoint throws outside the Unicode range; keep such input as-is.
        return Number.isInteger(code) && code >= 0 && code <= 0x10ffff
          ? String.fromCodePoint(code)
          : whole;
      },
    );
  }
}
|