phase-10.5: screener enhancements
This commit is contained in:
@@ -0,0 +1,43 @@
|
||||
/**
 * Minimal RSS/Atom extraction — enough for EDGAR atom feeds and PR-wire RSS.
 * Deliberately dependency-free; if a feed outgrows this, swap in
 * fast-xml-parser without touching the pollers' output shape.
 *
 * This is regex-based scanning, not a real XML parser: it does not handle
 * nested elements of the same name, namespaces, or comments.
 */
export class RssParser {
  /** The five entities predefined by XML, keyed by name (without `&` and `;`). */
  private static readonly NAMED_ENTITIES: Readonly<Record<string, string>> = {
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'",
  };

  /**
   * Extract raw <item>…</item> or <entry>…</entry> blocks.
   *
   * @param xml Full feed document text.
   * @param tag Element name delimiting one feed entry.
   * @returns Every matching block, including its opening and closing tags,
   *          in document order; an empty array when there are none.
   */
  static blocks(xml: string, tag: 'item' | 'entry'): string[] {
    // `[\s>]` after the name keeps `<item` from matching `<items>` and friends.
    const re = new RegExp(`<${tag}[\\s>][\\s\\S]*?<\\/${tag}>`, 'g');
    return xml.match(re) ?? [];
  }

  /**
   * First occurrence of a simple tag's text content, entity-decoded.
   *
   * @param block Raw XML of a single entry (or any XML fragment).
   * @param name  Element name to look up; matched case-insensitively.
   * @returns The cleaned text (CDATA unwrapped, inner markup stripped,
   *          whitespace collapsed), or `null` when the element is absent.
   */
  static tag(block: string, name: string): string | null {
    // Escape regex metacharacters so names such as `dc:date` or `a.b` are literal.
    const safe = name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // The name must be followed by whitespace or `>` so that looking up `id`
    // does not latch onto `<identifier>`.
    const re = new RegExp(
      `<${safe}(?:\\s[^>]*)?>([\\s\\S]*?)<\\/${safe}\\s*>`,
      'i',
    );
    const m = re.exec(block);
    return m ? RssParser.clean(m[1] ?? '') : null;
  }

  /**
   * Atom-style <link href="…"/> (self-closing) or RSS <link>…</link>.
   *
   * @param block Raw XML of a single entry.
   * @returns The first `href` attribute found on a `<link>` element
   *          (entity-decoded and trimmed); otherwise the text content of the
   *          first `<link>` element; `null` when neither exists.
   */
  static link(block: string): string | null {
    // XML permits either quote style around attribute values.
    const href = /<link[^>]*href=(?:"([^"]+)"|'([^']+)')/i.exec(block);
    const value = href?.[1] ?? href?.[2];
    if (value !== undefined) return RssParser.decode(value.trim());
    return RssParser.tag(block, 'link');
  }

  /**
   * Reduce an element's inner XML to plain text: unwrap CDATA sections,
   * replace any markup with a space, decode entities, collapse whitespace.
   */
  private static clean(raw: string): string {
    const noCdata = raw.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, '$1');
    // Tags are stripped before decoding so that escaped markup
    // (`&lt;b&gt;`) survives as literal text instead of being removed.
    const noTags = noCdata.replace(/<[^>]+>/g, ' ');
    return RssParser.decode(noTags).replace(/\s+/g, ' ').trim();
  }

  /**
   * Decode the predefined XML entities plus decimal (`&#39;`) and
   * hexadecimal (`&#x27;`) character references.
   *
   * Decoding happens in a single pass, so text produced by one replacement
   * is never decoded again (`&amp;lt;` yields `&lt;`, not `<`). Unknown or
   * out-of-range references are left untouched.
   */
  private static decode(s: string): string {
    return s.replace(
      /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot|apos));/g,
      (
        match: string,
        dec: string | undefined,
        hex: string | undefined,
        named: string | undefined,
      ): string => {
        if (named !== undefined) {
          return RssParser.NAMED_ENTITIES[named] ?? match;
        }
        const codePoint =
          dec !== undefined
            ? Number.parseInt(dec, 10)
            : Number.parseInt(hex ?? '', 16);
        // fromCodePoint throws above U+10FFFF; NaN also fails this comparison.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
      },
    );
  }
}
|
||||
Reference in New Issue
Block a user