phase-10.5: screener enhancements

This commit is contained in:
saikiranvella
2026-06-11 19:18:19 -04:00
parent bac00ab5d5
commit e953822bab
51 changed files with 3745 additions and 36 deletions
+165
View File
@@ -0,0 +1,165 @@
import { createHash } from 'crypto';
import { NewsRepository } from './NewsRepository';
import type { CatalystType, IngestStats, NormalizedStory } from '../shared/types';
/**
 * Shared ingest pipeline (FREE-DATA-STACK §2) — every source flows through
 * here: FILTER → DEDUPE → CLASSIFY → STORE. All drops happen BEFORE insert,
 * cheapest check first, so the tables stay small by construction (§4).
 */
export class NewsPipeline {
  /** §4.4 — max stories linked per ticker per day (filings exempt). */
  private static readonly DAILY_CAP = 25;

  /** §4.3 — syndicated-copy window for title dedupe. */
  private static readonly TITLE_WINDOW_MS = 48 * 60 * 60 * 1000;

  /** §4.2 — headlines with no decision value are never stored. */
  private static readonly NOISE_PATTERNS: RegExp[] = [
    /\b\d+\s+(?:best|top|hot)\s+stocks?\b/i,
    /\bstocks?\s+to\s+(?:watch|buy|sell)\b/i,
    /\bprice\s+target\s+(?:raised|lowered|reiterated|maintained)\b/i,
    /\b(?:premarket|after-?hours?)\s+movers?\b/i,
    /\bwhy\s+.{0,40}\s+stock\s+(?:jumped|popped|soared|plunged|tanked)\b/i,
    /\bmotley\s+fool\b/i,
  ];

  constructor(private readonly repo: NewsRepository) {}

  /**
   * Run a batch of normalized stories through the pipeline.
   *
   * @param stories  Stories produced by a poller.
   * @param universe Tracked-ticker set (upper-case) from UniverseProvider.
   * @returns Counters for the batch: how many were fetched, stored, and
   *          dropped at each stage.
   */
  ingest(stories: NormalizedStory[], universe: Set<string>): IngestStats {
    const stats: IngestStats = {
      fetched: stories.length,
      stored: 0,
      droppedNoUniverseTicker: 0,
      droppedNoise: 0,
      droppedDuplicate: 0,
      droppedCapped: 0,
    };
    for (const story of stories) {
      this.ingestOne(story, universe, stats);
    }
    return stats;
  }

  /**
   * Push a single story through every stage, bumping exactly one counter in
   * `stats` (either `stored` or one of the `dropped*` fields).
   */
  private ingestOne(story: NormalizedStory, universe: Set<string>, stats: IngestStats): void {
    const isFiling = story.source === 'edgar';

    // 1. Universe filter — the big one (§4.1)
    const tickers = [...new Set(story.tickers.map((t) => t.toUpperCase()))].filter((t) =>
      universe.has(t),
    );
    if (tickers.length === 0) {
      stats.droppedNoUniverseTicker++;
      return;
    }

    // 2. Noise blocklist (§4.2) — filings are never noise
    if (!isFiling && NewsPipeline.isNoise(story.headline)) {
      stats.droppedNoise++;
      return;
    }

    // 3. Dedupe (§4.3): url hash (storage-level PK) + recent title match
    const urlHash = NewsPipeline.sha(story.url);
    const normalizedTitle = NewsPipeline.normalizeTitle(story.headline);
    const titleHash = NewsPipeline.sha(normalizedTitle);
    // The title window exists to catch syndicated copies of one story. It is
    // skipped in two cases where equal titles do NOT imply the same story:
    //  - filings: their headline is synthesized from form type + company, so
    //    two distinct filings of the same form within the window would
    //    otherwise collapse into one;
    //  - titles that normalize to '' (e.g. non-Latin headlines), which would
    //    all share a single hash.
    // The url-hash primary key still rejects true duplicates at insert time.
    const titleDedupeApplies = !isFiling && normalizedTitle.length > 0;
    if (titleDedupeApplies) {
      const titleCutoff = new Date(Date.now() - NewsPipeline.TITLE_WINDOW_MS).toISOString();
      if (this.repo.titleSeenSince(titleHash, titleCutoff)) {
        stats.droppedDuplicate++;
        return;
      }
    }

    // 4. Per-ticker daily cap (§4.4) — filings keep priority past the cap
    const day = story.publishedAt.slice(0, 10);
    const eligible = isFiling
      ? tickers
      : tickers.filter((t) => this.repo.countTickerDay(t, day) < NewsPipeline.DAILY_CAP);
    if (eligible.length === 0) {
      stats.droppedCapped++;
      return;
    }

    // 5. Classify + store
    const catalyst = story.catalystHint ?? NewsPipeline.classify(story.headline);
    const inserted = this.repo.insertArticle({
      urlHash,
      titleHash,
      tickers: eligible,
      headline: story.headline.trim(),
      body: story.body ?? null,
      source: story.source,
      catalyst,
      url: story.url,
      publishedAt: story.publishedAt,
    });
    if (!inserted) {
      stats.droppedDuplicate++; // url_hash collision — already stored
      return;
    }
    for (const ticker of eligible) {
      this.repo.linkTicker(ticker, day, urlHash);
    }
    stats.stored++;
  }

  /**
   * Retention jobs (§5) — call once daily.
   *
   * @param now Reference time; bodies older than 90 days are purged and
   *            unreferenced rows older than 548 days (~18 months) deleted.
   * @returns Row counts reported by the repository for each job.
   */
  runRetention(now = new Date()): { bodiesPurged: number; rowsDeleted: number } {
    const bodyCutoff = new Date(now.getTime() - 90 * 24 * 60 * 60 * 1000).toISOString();
    const rowCutoff = new Date(now.getTime() - 548 * 24 * 60 * 60 * 1000).toISOString(); // ~18mo
    return {
      bodiesPurged: this.repo.purgeBodiesBefore(bodyCutoff),
      rowsDeleted: this.repo.deleteUnreferencedBefore(rowCutoff),
    };
  }

  // ── Pure helpers (exposed for tests) ──────────────────────────────────────

  /** True when the headline matches any §4.2 noise pattern. */
  static isNoise(headline: string): boolean {
    return NewsPipeline.NOISE_PATTERNS.some((re) => re.test(headline));
  }

  /**
   * Keyword catalyst classifier. Order matters: M&A beats earnings
   * ("acquisition closes in Q2" is an M&A story).
   *
   * @returns The first matching catalyst, or null when no keyword matches.
   */
  static classify(headline: string): CatalystType | null {
    const h = headline.toLowerCase();
    if (
      /\b(acqui[sr]|merger|takeover|buyout|tender offer|business combination|to be acquired)/.test(
        h,
      )
    )
      return 'ma';
    if (/\b(guidance|outlook|forecast|raises full[- ]year|lowers full[- ]year)/.test(h))
      return 'guidance';
    if (
      /\b(earnings|results|eps|quarterly report|q[1-4] (?:20\d\d|results)|fiscal (?:year|q[1-4]))/.test(
        h,
      )
    )
      return 'earnings';
    if (
      /\b(sec |fda|doj|ftc|antitrust|investigation|subpoena|lawsuit|settl|recall|approval)/.test(h)
    )
      return 'regulatory';
    if (/\b(fed |fomc|inflation|cpi|jobs report|rate (?:cut|hike)|treasury yield)/.test(h))
      return 'macro';
    return null;
  }

  /**
   * Canonical form used for title dedupe: lower-case, everything except
   * a-z, 0-9 and spaces removed, whitespace collapsed. May return '' when
   * the title contains none of those characters.
   */
  static normalizeTitle(title: string): string {
    return title
      .toLowerCase()
      .replace(/[^a-z0-9 ]/g, '')
      .replace(/\s+/g, ' ')
      .trim();
  }

  /** Hex-encoded SHA-256 of the input. */
  private static sha(input: string): string {
    return createHash('sha256').update(input).digest('hex');
  }
}
+76
View File
@@ -0,0 +1,76 @@
import { DatabaseConnection } from '../shared/db/index';
import { QueryBuilder } from '../shared/utils/QueryBuilder';
import type { NewsArticleRow } from '../shared/types';
/**
 * Storage gateway for the free-tier news pipeline (FREE-DATA-STACK §3).
 * Contains no business rules: each method binds parameters to one named
 * query and hands it to the database connection. Filtering and dedupe
 * decisions belong to NewsPipeline.
 */
export class NewsRepository {
  constructor(private readonly db: DatabaseConnection) {}

  /**
   * Insert one article row, stamping it with the current time as the last
   * bound parameter.
   *
   * @returns true if the row was inserted (false = duplicate url_hash).
   */
  insertArticle(a: {
    urlHash: string;
    titleHash: string;
    tickers: string[];
    headline: string;
    body: string | null;
    source: string;
    catalyst: string | null;
    url: string;
    publishedAt: string;
  }): boolean {
    const { urlHash, titleHash, tickers, headline, body, source, catalyst, url, publishedAt } = a;
    const tickerJson = JSON.stringify(tickers); // ticker list is bound as a JSON array string
    const insertedAt = new Date().toISOString();
    const statement = new QueryBuilder('NEWS_QUERIES.INSERT_ARTICLE', [
      urlHash,
      titleHash,
      tickerJson,
      headline,
      body,
      source,
      catalyst,
      url,
      publishedAt,
      insertedAt,
    ]);
    const changed = this.db.run(statement);
    return changed > 0;
  }

  /** True when the TITLE_SEEN_SINCE query yields a row for this hash/cutoff. */
  titleSeenSince(titleHash: string, sinceIso: string): boolean {
    const hit = this.db.get(new QueryBuilder('NEWS_QUERIES.TITLE_SEEN_SINCE', [titleHash, sinceIso]));
    return hit !== null && hit !== undefined;
  }

  /** Record a (ticker, day) → article link. */
  linkTicker(ticker: string, day: string, urlHash: string): void {
    const params = [ticker, day, urlHash];
    this.db.run(new QueryBuilder('NEWS_QUERIES.INSERT_CATALYST_LINK', params));
  }

  /** Value of column `n` from COUNT_TICKER_DAY, or 0 when there is no row. */
  countTickerDay(ticker: string, day: string): number {
    const row = this.db.get<{ n: number }>(
      new QueryBuilder('NEWS_QUERIES.COUNT_TICKER_DAY', [ticker, day]),
    );
    const count = row?.n;
    return count ?? 0;
  }

  /** Rows from SELECT_TICKER_NEWS for the upper-cased ticker and `sinceDay`. */
  newsForTicker(ticker: string, sinceDay: string): NewsArticleRow[] {
    const symbol = ticker.toUpperCase();
    const statement = new QueryBuilder('NEWS_QUERIES.SELECT_TICKER_NEWS', [symbol, sinceDay]);
    return this.db.all<NewsArticleRow>(statement);
  }

  /** Rows from SELECT_RECENT, with `limit` bound as its only parameter. */
  recent(limit: number): NewsArticleRow[] {
    return this.db.all<NewsArticleRow>(new QueryBuilder('NEWS_QUERIES.SELECT_RECENT', [limit]));
  }

  /** Retention: null out bodies older than cutoff. Returns rows changed. */
  purgeBodiesBefore(cutoffIso: string): number {
    const statement = new QueryBuilder('NEWS_QUERIES.PURGE_BODIES_BEFORE', [cutoffIso]);
    return this.db.run(statement);
  }

  /** Retention: delete old rows no ticker references. Returns rows deleted. */
  deleteUnreferencedBefore(cutoffIso: string): number {
    const statement = new QueryBuilder('NEWS_QUERIES.DELETE_UNREFERENCED_BEFORE', [cutoffIso]);
    return this.db.run(statement);
  }
}
+106
View File
@@ -0,0 +1,106 @@
import { NewsPipeline } from './NewsPipeline';
import { UniverseProvider } from './UniverseProvider';
import { EdgarPoller } from './pollers/EdgarPoller';
import { PrWirePoller } from './pollers/PrWirePoller';
import type { IngestStats, Logger } from '../shared/types';
/**
 * In-process polling scheduler (FREE-DATA-STACK §2). No Redis/BullMQ at the
 * free tier — plain intervals, unref'd so they never hold the process open.
 *
 * Cadences: EDGAR 10 min, PR-wire 15 min, retention daily.
 * Disable entirely with NEWS_POLL=off (e.g. when running bin/poll-news.ts
 * from cron instead of inside the server).
 * NOTE(review): the NEWS_POLL switch is not read in this class — presumably
 * the caller decides whether to invoke start(); confirm.
 */
export class NewsScheduler {
  private static readonly EDGAR_INTERVAL_MS = 10 * 60 * 1000;
  private static readonly PRWIRE_INTERVAL_MS = 15 * 60 * 1000;
  private static readonly RETENTION_INTERVAL_MS = 24 * 60 * 60 * 1000;
  /** Delay before the one-off priming cycle after start(). */
  private static readonly BOOT_DELAY_MS = 15_000;

  /** Recurring interval handles; non-empty means "running". */
  private timers: NodeJS.Timeout[] = [];
  /** Pending one-shot priming timer, or null once fired/cancelled. */
  private bootTimer: NodeJS.Timeout | null = null;

  constructor(
    private readonly pipeline: NewsPipeline,
    private readonly universe: UniverseProvider,
    private readonly edgar: EdgarPoller,
    private readonly prwire: PrWirePoller,
    private readonly logger: Logger,
  ) {}

  /** Start all recurring jobs plus a delayed priming cycle. Idempotent. */
  start(): void {
    if (this.timers.length > 0) return; // already running
    const every = (ms: number, fn: () => void) => {
      const t = setInterval(fn, ms);
      t.unref(); // never keep the process alive just for polling
      this.timers.push(t);
    };
    every(NewsScheduler.EDGAR_INTERVAL_MS, () => void this.runEdgar());
    every(NewsScheduler.PRWIRE_INTERVAL_MS, () => void this.runPrWire());
    every(NewsScheduler.RETENTION_INTERVAL_MS, () => this.runRetention());

    // Prime once shortly after boot (delay keeps server startup fast)
    const boot = setTimeout(() => {
      this.bootTimer = null;
      void this.runOnce();
    }, NewsScheduler.BOOT_DELAY_MS);
    boot.unref();
    this.bootTimer = boot;

    this.logger.log('News scheduler started (EDGAR 10m, PR-wire 15m, retention 24h)');
  }

  /** Cancel every recurring job and the priming timer if still pending. */
  stop(): void {
    for (const t of this.timers) clearInterval(t);
    this.timers = [];
    if (this.bootTimer !== null) {
      clearTimeout(this.bootTimer); // one-shot handle: clear with the matching API
      this.bootTimer = null;
    }
  }

  /**
   * One poll cycle of both sources, EDGAR first then PR-wire — used at boot
   * and by bin/poll-news.ts. Retention is not part of this cycle.
   * Never rejects: each source reports failures as all-zero stats.
   */
  async runOnce(): Promise<{ edgar: IngestStats; prwire: IngestStats }> {
    const edgar = await this.runEdgar();
    const prwire = await this.runPrWire();
    return { edgar, prwire };
  }

  /** Poll EDGAR and ingest; failures are logged and yield empty stats. */
  private async runEdgar(): Promise<IngestStats> {
    try {
      const stories = await this.edgar.poll(this.universe.getUniverse());
      const stats = this.pipeline.ingest(stories, this.universe.getUniverse());
      if (stats.stored > 0) this.logger.log(`EDGAR: stored ${stats.stored}/${stats.fetched}`);
      return stats;
    } catch (err: unknown) {
      this.logger.warn('EDGAR poll cycle failed:', NewsScheduler.describeError(err));
      return NewsScheduler.emptyStats();
    }
  }

  /** Poll the PR wires and ingest; failures are logged and yield empty stats. */
  private async runPrWire(): Promise<IngestStats> {
    try {
      const stories = await this.prwire.poll();
      const stats = this.pipeline.ingest(stories, this.universe.getUniverse());
      if (stats.stored > 0) this.logger.log(`PR-wire: stored ${stats.stored}/${stats.fetched}`);
      return stats;
    } catch (err: unknown) {
      this.logger.warn('PR-wire poll cycle failed:', NewsScheduler.describeError(err));
      return NewsScheduler.emptyStats();
    }
  }

  /** Run the pipeline's retention jobs, logging the outcome either way. */
  private runRetention(): void {
    try {
      const { bodiesPurged, rowsDeleted } = this.pipeline.runRetention();
      this.logger.log(`News retention: ${bodiesPurged} bodies purged, ${rowsDeleted} rows deleted`);
    } catch (err: unknown) {
      this.logger.warn('News retention failed:', NewsScheduler.describeError(err));
    }
  }

  /**
   * Human-readable text for anything that can be thrown. A thrown value is
   * not guaranteed to be an Error, so `.message` is only read after a check.
   */
  private static describeError(err: unknown): string {
    return err instanceof Error ? err.message : String(err);
  }

  /** All-zero stats, returned when a cycle fails before ingesting. */
  private static emptyStats(): IngestStats {
    return {
      fetched: 0,
      stored: 0,
      droppedNoUniverseTicker: 0,
      droppedNoise: 0,
      droppedDuplicate: 0,
      droppedCapped: 0,
    };
  }
}
+50
View File
@@ -0,0 +1,50 @@
import { DatabaseConnection } from '../shared/db/index';
import { QueryBuilder } from '../shared/utils/QueryBuilder';
/**
 * The tracked-ticker universe (FREE-DATA-STACK §4.1): the union of
 * watchlist tickers, holding tickers, and snapshot tickers from the last
 * 30 days.
 *
 * This is the news pipeline's first and biggest filter — stories about
 * tickers outside the universe are never stored. The result is cached for
 * 10 minutes because the universe changes slowly.
 */
export class UniverseProvider {
  private static readonly CACHE_TTL_MS = 10 * 60 * 1000;
  private static readonly SNAPSHOT_LOOKBACK_DAYS = 30;

  /** Last computed universe and the epoch-ms instant it stops being valid. */
  private cache: { universe: Set<string>; expiresAt: number } = {
    universe: new Set(),
    expiresAt: 0,
  };

  constructor(private readonly db: DatabaseConnection) {}

  /**
   * Return the upper-cased ticker universe, re-reading the three source
   * queries only when the cached copy has expired.
   */
  getUniverse(): Set<string> {
    const cached = this.cache;
    if (Date.now() < cached.expiresAt) {
      return cached.universe;
    }

    // Snapshot lookback boundary as a YYYY-MM-DD day string.
    const lookbackMs = UniverseProvider.SNAPSHOT_LOOKBACK_DAYS * 24 * 60 * 60 * 1000;
    const sinceDay = new Date(Date.now() - lookbackMs).toISOString().slice(0, 10);

    const merged = new Set<string>();
    const collect = (rows: { ticker: string }[]): void => {
      for (const row of rows) {
        merged.add(row.ticker.toUpperCase());
      }
    };

    collect(this.db.all(new QueryBuilder('UNIVERSE_QUERIES.DISTINCT_WATCHLIST_TICKERS')));
    collect(this.db.all(new QueryBuilder('UNIVERSE_QUERIES.DISTINCT_HOLDING_TICKERS')));
    collect(
      this.db.all(new QueryBuilder('UNIVERSE_QUERIES.DISTINCT_SNAPSHOT_TICKERS_SINCE', [sinceDay])),
    );

    const expiresAt = Date.now() + UniverseProvider.CACHE_TTL_MS;
    this.cache = { universe: merged, expiresAt };
    return merged;
  }

  /** Force next getUniverse() to re-read (e.g. after a watchlist change). */
  invalidate(): void {
    this.cache.expiresAt = 0;
  }
}
+10
View File
@@ -0,0 +1,10 @@
// News domain — free-tier news ingestion pipeline (FREE-DATA-STACK.md)
export { NewsController } from './news.controller';
export { NewsRepository } from './NewsRepository';
export { NewsPipeline } from './NewsPipeline';
export { UniverseProvider } from './UniverseProvider';
export { NewsScheduler } from './NewsScheduler';
export { EdgarPoller } from './pollers/EdgarPoller';
export { PrWirePoller } from './pollers/PrWirePoller';
export { RssParser } from './rss';
+90
View File
@@ -0,0 +1,90 @@
import type { FastifyInstance, FastifyRequest } from 'fastify';
import { NewsRepository } from './NewsRepository';
import { YahooFinanceClient } from '../shared';
import type { NewsArticleRow } from '../shared/types';
/**
 * Wire shape of one story in the news API responses. Both stored rows and
 * live Yahoo results are mapped into this shape before being merged.
 */
interface StoryView {
  /** Story title. */
  headline: string;
  /** Ticker symbols attached to the story. */
  tickers: string[];
  /** Origin of the story; live results use 'yahoo'. */
  source: string;
  /** Catalyst tag, or null when unclassified (always null for live results). */
  catalyst: string | null;
  /** Story link; used as the dedupe key when merging stored and live results. */
  url: string;
  /** Publication timestamp string; the merged list is sorted on it, newest first. */
  publishedAt: string;
}
/**
 * Read side of the news pipeline. Stored pipeline stories (curated, catalyst-
 * tagged, historical) are merged with a live per-ticker Yahoo search on
 * request — stored gives depth, live gives freshness. The RSS firehoses
 * can't be queried per-ticker on demand, which is why they go through the
 * polling pipeline instead.
 */
export class NewsController {
  private static readonly DEFAULT_DAYS = 7;
  private static readonly MAX_DAYS = 90;
  private static readonly DEFAULT_LIMIT = 50;
  private static readonly MAX_LIMIT = 200;

  constructor(
    private readonly repo: NewsRepository,
    private readonly yahoo?: YahooFinanceClient,
  ) {}

  /** Attach both GET routes to the Fastify instance. */
  register(app: FastifyInstance): void {
    app.get('/api/news/recent', this.recent.bind(this));
    app.get('/api/news/:ticker', this.byTicker.bind(this));
  }

  /**
   * GET /api/news/:ticker?days=7&live=1 (live Yahoo merge on by default).
   *
   * `days` is read as a whole number in [1, 90]; missing, non-numeric or
   * non-positive values fall back to 7. `live=0` skips the Yahoo lookup.
   */
  private async byTicker(req: FastifyRequest) {
    const ticker = (req.params as { ticker: string }).ticker.toUpperCase();
    const query = req.query as { days?: string; live?: string };
    const days = NewsController.boundedInt(
      query.days,
      NewsController.DEFAULT_DAYS,
      NewsController.MAX_DAYS,
    );
    const live = query.live !== '0';
    const sinceDay = new Date(Date.now() - days * 24 * 60 * 60 * 1000).toISOString().slice(0, 10);

    const stored = this.repo.newsForTicker(ticker, sinceDay).map(NewsController.serialize);
    const fresh = live ? await this.fetchLive(ticker) : [];

    // Merge, dedupe by URL, newest first. Stored stories come first so they
    // win over a live copy of the same URL (they carry the catalyst tag).
    const byUrl = new Map<string, StoryView>();
    for (const s of [...stored, ...fresh]) byUrl.set(s.url, byUrl.get(s.url) ?? s);
    const stories = [...byUrl.values()].sort((a, b) => b.publishedAt.localeCompare(a.publishedAt));
    return { ticker, days, stories };
  }

  /**
   * Live per-ticker Yahoo news search — freshness layer, best-effort.
   * Returns [] when no Yahoo client was injected or the lookup fails, so a
   * Yahoo outage never breaks the stored-news response.
   */
  private async fetchLive(ticker: string): Promise<StoryView[]> {
    if (!this.yahoo) return [];
    try {
      const items = await this.yahoo.search(ticker, { newsCount: 8 });
      return items
        .filter((n) => n.title && n.link)
        .map((n) => ({
          headline: n.title as string,
          tickers: [ticker],
          source: 'yahoo',
          catalyst: null,
          url: n.link as string,
          // NOTE(review): assumes providerPublishTime is a Date or epoch
          // milliseconds — confirm against YahooFinanceClient.
          publishedAt: n.providerPublishTime
            ? new Date(n.providerPublishTime).toISOString()
            : new Date().toISOString(),
        }));
    } catch {
      // Deliberate: the live layer is optional.
      return [];
    }
  }

  /**
   * GET /api/news/recent?limit=50
   *
   * `limit` is read as a whole number in [1, 200]; missing, non-numeric or
   * non-positive values fall back to 50. The lower bound matters because
   * the value is bound straight into the query — on engines such as SQLite
   * a negative LIMIT removes the bound entirely.
   */
  private async recent(req: FastifyRequest) {
    const limit = NewsController.boundedInt(
      (req.query as { limit?: string }).limit,
      NewsController.DEFAULT_LIMIT,
      NewsController.MAX_LIMIT,
    );
    return { stories: this.repo.recent(limit).map(NewsController.serialize) };
  }

  /**
   * Parse a query-string value as a positive integer.
   *
   * @param raw      Raw query value (may be absent).
   * @param fallback Used when `raw` is missing, not a finite number, or < 1.
   * @param max      Upper clamp.
   * @returns An integer in [1, max], or `fallback`.
   */
  private static boundedInt(raw: string | undefined, fallback: number, max: number): number {
    const parsed = Math.trunc(Number(raw));
    if (!Number.isFinite(parsed) || parsed < 1) return fallback;
    return Math.min(parsed, max);
  }

  /** Map a stored row to the API shape; `ticker_list` holds a JSON array. */
  private static serialize(row: NewsArticleRow) {
    return {
      headline: row.headline,
      tickers: JSON.parse(row.ticker_list) as string[],
      source: row.source,
      catalyst: row.catalyst,
      url: row.url,
      publishedAt: row.published_at,
    };
  }
}
+122
View File
@@ -0,0 +1,122 @@
import { RssParser } from '../rss';
import type { CatalystType, Logger, NormalizedStory } from '../../shared/types';
/**
 * SEC EDGAR poller (FREE-DATA-STACK §1.3 / P1.2 Tier 2). Free forever, and
 * the highest-value source: filings frequently precede the headline.
 *
 * Strategy: poll the site-wide "current filings" atom feed once per form
 * type (4 requests/cycle total, well inside SEC fair use), map filer CIK →
 * ticker(s) via the daily-cached company_tickers.json, and emit stories only
 * for universe tickers. The pipeline applies its own universe filter again —
 * defense in depth.
 *
 * SEC requires a descriptive User-Agent with contact info: set
 * EDGAR_USER_AGENT in .env (e.g. "market-screener/1.0 you@example.com").
 */
export class EdgarPoller {
  private static readonly TICKER_MAP_URL = 'https://www.sec.gov/files/company_tickers.json';
  private static readonly TICKER_MAP_TTL_MS = 24 * 60 * 60 * 1000;

  /** form type → catalyst classification (overrides keyword classify). */
  private static readonly FORMS: Array<{ form: string; catalyst: CatalystType }> = [
    { form: '8-K', catalyst: 'regulatory' }, // material events
    { form: 'SC 13D', catalyst: 'ma' }, // activist stake >5% — classic pre-M&A tell
    { form: 'S-4', catalyst: 'ma' }, // merger registration
    { form: 'DEFM14A', catalyst: 'ma' }, // merger proxy
  ];

  /**
   * 10-digit zero-padded CIK → every ticker listed for it. One CIK can
   * carry several tickers (e.g. multiple share classes), so a single-valued
   * map would silently keep only the last one seen.
   */
  private cikToTickers: Map<string, string[]> = new Map();
  private mapExpiresAt = 0;

  constructor(
    private readonly logger: Logger,
    private readonly userAgent = process.env.EDGAR_USER_AGENT ??
      'market-screener/1.0 (set EDGAR_USER_AGENT in .env)',
  ) {}

  /**
   * Fetch all form feeds and return normalized stories for universe tickers.
   * A failing form feed is logged and skipped; the others still contribute.
   *
   * @throws If the ticker map cannot be loaded and no earlier map exists.
   */
  async poll(universe: Set<string>): Promise<NormalizedStory[]> {
    if (universe.size === 0) return [];
    await this.refreshTickerMap();
    const stories: NormalizedStory[] = [];
    for (const { form, catalyst } of EdgarPoller.FORMS) {
      try {
        const xml = await this.fetchText(EdgarPoller.feedUrl(form));
        stories.push(...this.parseFeed(xml, form, catalyst, universe));
      } catch (err: unknown) {
        this.logger.warn(`EDGAR ${form} feed failed:`, EdgarPoller.describeError(err));
      }
    }
    return stories;
  }

  /**
   * Parse one atom feed. Public for fixture tests.
   *
   * Entries are skipped when they lack a title, link or timestamp, when the
   * timestamp cannot be parsed, when the title carries no 10-digit CIK, or
   * when none of the CIK's tickers is in `universe`.
   */
  parseFeed(
    xml: string,
    form: string,
    catalyst: CatalystType,
    universe: Set<string>,
  ): NormalizedStory[] {
    const stories: NormalizedStory[] = [];
    for (const entry of RssParser.blocks(xml, 'entry')) {
      const title = RssParser.tag(entry, 'title') ?? '';
      const updated = RssParser.tag(entry, 'updated');
      const url = RssParser.link(entry);
      if (!title || !url || !updated) continue;

      // A malformed timestamp must only cost this entry, not the whole feed
      // (Date#toISOString throws RangeError on an invalid date).
      const publishedMs = Date.parse(updated);
      if (Number.isNaN(publishedMs)) continue;

      // Title format: "8-K - APPLE INC (0000320193) (Filer)"
      const cikMatch = /\((\d{10})\)/.exec(title);
      if (!cikMatch) continue;
      const tickers = (this.cikToTickers.get(cikMatch[1]) ?? []).filter((t) => universe.has(t));
      if (tickers.length === 0) continue;

      stories.push({
        tickers,
        headline: `${form} filing: ${EdgarPoller.companyFromTitle(title)}`,
        body: null,
        source: 'edgar',
        url,
        publishedAt: new Date(publishedMs).toISOString(),
        catalystHint: catalyst,
      });
    }
    return stories;
  }

  /** Inject a CIK→ticker map directly (tests). CIKs are 10-digit zero-padded. */
  setTickerMap(map: Map<string, string>): void {
    const multi = new Map<string, string[]>();
    for (const [cik, ticker] of map) multi.set(cik, [ticker]);
    this.installTickerMap(multi);
  }

  /** Swap in a new map and restart its TTL. */
  private installTickerMap(map: Map<string, string[]>): void {
    this.cikToTickers = map;
    this.mapExpiresAt = Date.now() + EdgarPoller.TICKER_MAP_TTL_MS;
  }

  /**
   * Reload company_tickers.json when the cached map is empty or expired.
   * If the reload fails but an older map exists, that map is kept (and the
   * reload retried on the next poll) so one bad fetch does not stop polling.
   */
  private async refreshTickerMap(): Promise<void> {
    if (Date.now() < this.mapExpiresAt && this.cikToTickers.size > 0) return;
    try {
      const raw = await this.fetchText(EdgarPoller.TICKER_MAP_URL);
      const data = JSON.parse(raw) as Record<string, { cik_str: number; ticker: string }>;
      const map = new Map<string, string[]>();
      for (const entry of Object.values(data)) {
        // External JSON: skip anything that is not the expected shape.
        if (entry == null || typeof entry.ticker !== 'string' || entry.cik_str == null) continue;
        const cik = String(entry.cik_str).padStart(10, '0');
        const ticker = entry.ticker.toUpperCase();
        const known = map.get(cik);
        if (known === undefined) map.set(cik, [ticker]);
        else if (!known.includes(ticker)) known.push(ticker);
      }
      this.installTickerMap(map);
      this.logger.log(`EDGAR ticker map refreshed: ${map.size} companies`);
    } catch (err: unknown) {
      if (this.cikToTickers.size === 0) throw err;
      this.logger.warn(
        'EDGAR ticker map refresh failed, reusing previous map:',
        EdgarPoller.describeError(err),
      );
    }
  }

  /**
   * Company name from a feed title such as
   * "8-K - APPLE INC (0000320193) (Filer)" → "APPLE INC".
   * The form prefix is cut at the first " - " separator rather than the
   * first hyphen, because form names themselves contain hyphens (8-K, S-4).
   */
  private static companyFromTitle(title: string): string {
    return title
      .replace(/^.*?\s+-\s+/, '')
      .replace(/\(\d{10}\)/g, '')
      .replace(/\((Filer|Subject|Reporting)\)/gi, '')
      .trim();
  }

  /** Text for any thrown value; `.message` is read only from real Errors. */
  private static describeError(err: unknown): string {
    return err instanceof Error ? err.message : String(err);
  }

  /** URL of the "current filings" atom feed for one form type (100 entries). */
  private static feedUrl(form: string): string {
    const type = encodeURIComponent(form);
    return `https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&type=${type}&company=&dateb=&owner=include&count=100&output=atom`;
  }

  /**
   * GET a URL with the configured User-Agent and return the body text.
   *
   * @throws Error on any non-2xx response.
   */
  private async fetchText(url: string): Promise<string> {
    const res = await fetch(url, { headers: { 'User-Agent': this.userAgent } });
    if (!res.ok) throw new Error(`HTTP ${res.status} for ${url}`);
    return res.text();
  }
}
@@ -0,0 +1,91 @@
import { RssParser } from '../rss';
import type { Logger, NormalizedStory } from '../../shared/types';
/**
 * PR-wire RSS poller (FREE-DATA-STACK §1.4 / P1.2 Tier 3) — press releases
 * that the other free feeds miss, mostly small-caps.
 *
 * Ticker extraction relies on the wire convention of exchange tags in the
 * text: "(NYSE: ABC)", "(Nasdaq: XYZ)". Stories without an exchange tag
 * produce no tickers and are skipped — that's intentional; untagged wire
 * stories are rarely decision-grade.
 *
 * Feed list is overridable: NEWS_PRWIRE_FEEDS="url1,url2" in .env
 * (wire RSS URLs change occasionally — if a feed 404s, update the env var).
 */
export class PrWirePoller {
  private static readonly DEFAULT_FEEDS = [
    // GlobeNewswire — public-company news
    'https://www.globenewswire.com/RssFeed/orgclass/1/feedTitle/GlobeNewswire%20-%20News%20about%20Public%20Companies',
    // PR Newswire — all news releases
    'https://www.prnewswire.com/rss/news-releases-list.rss',
  ];

  /** "(EXCHANGE: SYMBOL)" — captures the symbol (1–6 letters/dots). */
  private static readonly EXCHANGE_TAG =
    /\((?:NYSE(?:\s+American)?|NASDAQ|Nasdaq|AMEX|CBOE|OTC(?:QB|QX|MKTS)?)\s*:\s*([A-Za-z][A-Za-z.]{0,5})\)/g;

  private readonly feeds: string[];

  /**
   * @param logger Receives a warning for each feed that fails to load.
   * @param feeds  Explicit feed URLs; when omitted, NEWS_PRWIRE_FEEDS is
   *               used, falling back to the built-in defaults.
   */
  constructor(
    private readonly logger: Logger,
    feeds?: string[],
  ) {
    // Blank entries ("a,,b", trailing comma) are dropped so they don't turn
    // into a guaranteed-failing fetch every cycle.
    const fromEnv = (process.env.NEWS_PRWIRE_FEEDS ?? '')
      .split(',')
      .map((s) => s.trim())
      .filter((s) => s.length > 0);
    this.feeds = feeds ?? (fromEnv.length > 0 ? fromEnv : PrWirePoller.DEFAULT_FEEDS);
  }

  /**
   * Fetch every configured feed in order and return all parsed stories.
   * A failing feed is logged and skipped; it never rejects the whole poll.
   */
  async poll(): Promise<NormalizedStory[]> {
    const stories: NormalizedStory[] = [];
    for (const feed of this.feeds) {
      try {
        const xml = await this.fetchText(feed);
        stories.push(...PrWirePoller.parseFeed(xml));
      } catch (err: unknown) {
        this.logger.warn(
          `PR-wire feed failed (${feed}):`,
          err instanceof Error ? err.message : String(err),
        );
      }
    }
    return stories;
  }

  /**
   * Parse one RSS feed. Public static for fixture tests.
   * Items without a title, a link, or any exchange-tagged ticker are skipped.
   */
  static parseFeed(xml: string): NormalizedStory[] {
    const stories: NormalizedStory[] = [];
    for (const item of RssParser.blocks(xml, 'item')) {
      const title = RssParser.tag(item, 'title');
      const url = RssParser.link(item);
      const pubDate = RssParser.tag(item, 'pubDate');
      if (!title || !url) continue;
      const description = RssParser.tag(item, 'description') ?? '';
      const tickers = PrWirePoller.extractTickers(`${title} ${description}`);
      if (tickers.length === 0) continue; // no exchange tag → skip early
      stories.push({
        tickers,
        headline: title,
        body: description || null,
        source: 'prwire',
        url,
        publishedAt: PrWirePoller.toIsoOrNow(pubDate),
      });
    }
    return stories;
  }

  /** "(NYSE: ABC)" / "(Nasdaq: XYZ)" → ['ABC', 'XYZ']. Public for tests. */
  static extractTickers(text: string): string[] {
    const out = new Set<string>();
    for (const m of text.matchAll(PrWirePoller.EXCHANGE_TAG)) {
      out.add(m[1].toUpperCase());
    }
    return [...out];
  }

  /**
   * ISO-8601 form of an RSS pubDate. A missing OR unparseable date falls
   * back to the current time — Date#toISOString throws RangeError on an
   * invalid date, which would otherwise discard the entire feed.
   */
  private static toIsoOrNow(pubDate: string | null): string {
    const ms = pubDate ? Date.parse(pubDate) : Number.NaN;
    return (Number.isNaN(ms) ? new Date() : new Date(ms)).toISOString();
  }

  /**
   * GET a feed URL and return the body text.
   *
   * @throws Error on any non-2xx response.
   */
  private async fetchText(url: string): Promise<string> {
    const res = await fetch(url, {
      headers: { 'User-Agent': 'market-screener/1.0 (+rss reader)' },
    });
    if (!res.ok) throw new Error(`HTTP ${res.status}`);
    return res.text();
  }
}
+43
View File
@@ -0,0 +1,43 @@
/**
 * Minimal RSS/Atom extraction — enough for EDGAR atom feeds and PR-wire RSS.
 * Deliberately dependency-free; if a feed outgrows this, swap in
 * fast-xml-parser without touching the pollers' output shape.
 */
export class RssParser {
  /** The five predefined XML entities. */
  private static readonly NAMED_ENTITIES: Readonly<Record<string, string>> = {
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'",
  };

  /** One entity reference: named, decimal (&#39;) or hex (&#x27;). */
  private static readonly ENTITY = /&(#[xX][0-9a-fA-F]+|#\d+|amp|lt|gt|quot|apos);/g;

  /** Extract raw <item>…</item> or <entry>…</entry> blocks. */
  static blocks(xml: string, tag: 'item' | 'entry'): string[] {
    const re = new RegExp(`<${tag}[\\s>][\\s\\S]*?<\\/${tag}>`, 'g');
    return xml.match(re) ?? [];
  }

  /**
   * First occurrence of a simple tag's text content, entity-decoded.
   *
   * @param block Raw XML of one item/entry.
   * @param name  Tag name; interpolated into a regex, so pass literal names.
   * @returns Cleaned text, or null when the tag is absent.
   */
  static tag(block: string, name: string): string | null {
    const re = new RegExp(`<${name}[^>]*>([\\s\\S]*?)<\\/${name}>`, 'i');
    const m = block.match(re);
    return m ? RssParser.clean(m[1]) : null;
  }

  /** Atom-style <link href="…"/> (self-closing) or RSS <link>…</link>. */
  static link(block: string): string | null {
    const href = block.match(/<link[^>]*href="([^"]+)"/i);
    if (href) return RssParser.decode(href[1].trim());
    return RssParser.tag(block, 'link');
  }

  /** Unwrap CDATA, strip markup, decode entities, collapse whitespace. */
  private static clean(raw: string): string {
    const noCdata = raw.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, '$1');
    const noTags = noCdata.replace(/<[^>]+>/g, ' ');
    return RssParser.decode(noTags).replace(/\s+/g, ' ').trim();
  }

  /**
   * Decode entity references in a single pass.
   *
   * One pass matters: decoding `&amp;` and then scanning the result again
   * would turn the escaped text `&amp;lt;` into `<` instead of `&lt;`.
   * Numeric references go through String.fromCodePoint so characters beyond
   * U+FFFF survive; out-of-range references are left as written.
   */
  private static decode(s: string): string {
    return s.replace(RssParser.ENTITY, (whole: string, body: string) => {
      if (body[0] !== '#') return RssParser.NAMED_ENTITIES[body] ?? whole;
      const isHex = body[1] === 'x' || body[1] === 'X';
      const codePoint = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
      // fromCodePoint throws RangeError above U+10FFFF — keep the raw text.
      if (!Number.isInteger(codePoint) || codePoint > 0x10ffff) return whole;
      return String.fromCodePoint(codePoint);
    });
  }
}