// market_screener/server/domains/news/rss.ts

/**
 * Minimal RSS/Atom extraction — enough for EDGAR atom feeds and PR-wire RSS.
 * Deliberately dependency-free; if a feed outgrows this, swap in
 * fast-xml-parser without touching the pollers' output shape.
 */
export class RssParser {
  /** The five predefined XML entities, keyed by name (without `&` and `;`). */
  private static readonly NAMED_ENTITIES: Readonly<Record<string, string>> = {
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'",
  };

  /**
   * Extract raw `<item>…</item>` or `<entry>…</entry>` blocks.
   *
   * The opening tag must be followed by whitespace or `>`, so `<items>` is
   * not mistaken for `<item>`. Matching is non-greedy and case-sensitive.
   *
   * @param xml Full feed document.
   * @param tag Element name to extract.
   * @returns Every matching block including its own tags, in document order;
   *   an empty array when there are none.
   */
  static blocks(xml: string, tag: 'item' | 'entry'): string[] {
    const re = new RegExp(`<${tag}[\\s>][\\s\\S]*?<\\/${tag}>`, 'g');
    return xml.match(re) ?? [];
  }

  /**
   * First occurrence of a simple tag's text content, cleaned and
   * entity-decoded.
   *
   * Only an element whose name is exactly `name` (case-insensitive) is
   * matched: `id` does not match `<identifier>`. Self-closing occurrences
   * (`<name …/>`) are skipped because they carry no text content.
   *
   * @param block XML fragment to search, typically one item/entry block.
   * @param name Element name, e.g. `title` or `dc:creator`.
   * @returns The cleaned text, or `null` when no such element is found.
   */
  static tag(block: string, name: string): string | null {
    // Escape regex metacharacters so names are always matched literally.
    const safe = name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const re = new RegExp(
      // `(?:\s…)?` keeps the name exact; `(?<!/)` rejects self-closing tags.
      `<${safe}(?:\\s[^>]*?)?(?<!\\/)>([\\s\\S]*?)<\\/${safe}\\s*>`,
      'i',
    );
    const inner = block.match(re)?.[1];
    return inner === undefined ? null : RssParser.clean(inner);
  }

  /**
   * Atom-style `<link href="…"/>` (self-closing) or RSS `<link>…</link>`.
   *
   * The first `<link>` element carrying a non-empty `href` attribute (double-
   * or single-quoted) wins; otherwise the text content of the first
   * `<link>…</link>` element is used.
   *
   * @param block XML fragment to search, typically one item/entry block.
   * @returns The entity-decoded URL text, or `null` when no link is found.
   */
  static link(block: string): string | null {
    // `(?=\s)` keeps the element name exact; `\shref` avoids e.g. `data-href`.
    const m = block.match(
      /<link(?=\s)[^>]*?\shref\s*=\s*(?:"([^"]+)"|'([^']+)')/i,
    );
    const href = m?.[1] ?? m?.[2];
    if (href !== undefined) return RssParser.decode(href.trim());
    return RssParser.tag(block, 'link');
  }

  /**
   * Reduce element content to plain text: unwrap CDATA sections, replace any
   * markup tags with a space, decode entities, then collapse runs of
   * whitespace and trim.
   */
  private static clean(raw: string): string {
    const noCdata = raw.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, '$1');
    const noTags = noCdata.replace(/<[^>]+>/g, ' ');
    return RssParser.decode(noTags).replace(/\s+/g, ' ').trim();
  }

  /**
   * Decode the predefined XML entities plus decimal (`&#39;`) and hexadecimal
   * (`&#x27;`) character references.
   *
   * Decoding happens in a single pass so already-decoded output is never
   * decoded again (`&amp;lt;` yields `&lt;`, not `<`). References beyond
   * U+10FFFF and unrecognised entities are left untouched.
   */
  private static decode(s: string): string {
    return s.replace(
      /&(#\d+|#[xX][0-9a-fA-F]+|amp|lt|gt|quot|apos);/g,
      (whole: string, body: string): string => {
        if (!body.startsWith('#')) {
          return RssParser.NAMED_ENTITIES[body] ?? whole;
        }
        const isHex = body.charAt(1).toLowerCase() === 'x';
        const codePoint = isHex
          ? parseInt(body.slice(2), 16)
          : parseInt(body.slice(1), 10);
        // fromCodePoint throws above U+10FFFF; keep such references verbatim.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
      },
    );
  }
}