Staggered RSS feed polling (#685)

* Ensure we poll feeds equally

* Ensure we poll at least once

* Create 685.misc

* Tidy up
This commit is contained in:
Will Hunt 2023-03-28 11:13:57 +01:00 committed by GitHub
parent 507b014bd5
commit 44eea7f7c3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 157 additions and 120 deletions

1
changelog.d/685.misc Normal file
View File

@ -0,0 +1 @@
Stagger RSS feed polling over the interval period, rather than attempting to poll all feeds at once. Should reduce memory / CPU spikes.

View File

@ -6,11 +6,9 @@ import { FeedEntry, FeedError, FeedReader} from "../feeds/FeedReader";
import { Logger } from "matrix-appservice-bridge";
import { IBridgeStorageProvider } from "../Stores/StorageProvider";
import { BaseConnection } from "./BaseConnection";
import axios from "axios";
import markdown from "markdown-it";
import { Connection, ProvisionConnectionOpts } from "./IConnection";
import { GetConnectionsResponseItem } from "../provisioning/api";
import { StatusCodes } from "http-status-codes";
const log = new Logger("FeedConnection");
const md = new markdown();

View File

@ -1,4 +1,4 @@
import { MatrixClient } from "matrix-bot-sdk";
import { MatrixClient, MatrixError } from "matrix-bot-sdk";
import { BridgeConfigFeeds } from "../Config/Config";
import { ConnectionManager } from "../ConnectionManager";
import { FeedConnection } from "../Connections";
@ -88,12 +88,29 @@ function normalizeUrl(input: string): string {
return url.toString();
}
/**
 * Shuffle an array in place using the Fisher–Yates algorithm.
 *
 * Note that the input array itself is mutated; the same reference is
 * returned for call-chaining convenience.
 *
 * @param array The array to be shuffled (modified in place).
 * @returns The same array, with its elements in a uniformly random order.
 */
function shuffle<T>(array: T[]): T[] {
    let remaining = array.length;
    while (remaining > 1) {
        // Pick a random element from the not-yet-placed prefix…
        const pick = Math.floor(Math.random() * remaining);
        remaining--;
        // …and swap it into the last unplaced slot.
        const held = array[remaining];
        array[remaining] = array[pick];
        array[pick] = held;
    }
    return array;
}
/**
 * The subset of a parsed RSS/Atom entry that the feed reader cares about.
 * All fields are optional because feeds in the wild frequently omit or
 * garble them; consumers must handle each being absent.
 */
interface FeedItem {
    /** Human-readable entry title, if the feed provided one. */
    title?: string;
    /** Permalink to the entry, if the feed provided one. */
    link?: string;
    /** Feed-supplied unique identifier (guid), if present. */
    id?: string;
}
export class FeedReader {
private readonly parser = FeedReader.buildParser();
private connections: FeedConnection[];
// ts should notice that we do in fact initialize it in constructor, but it doesn't (in this version)
private observedFeedUrls: Set<string> = new Set();
private feedQueue: string[] = [];
private seenEntries: Map<string, string[]> = new Map();
// A set of last modified times for each url.
private cacheTimes: Map<string, { etag?: string, lastModified?: string}> = new Map();
@ -107,6 +124,10 @@ export class FeedReader {
private shouldRun = true;
private timeout?: NodeJS.Timeout;
/**
 * How long (in milliseconds) to sleep between polling individual feeds.
 *
 * The configured poll interval is divided evenly across the queued feed
 * URLs so that polling is staggered over the whole interval rather than
 * hitting every feed at once. Guards against division by zero when the
 * queue is empty.
 */
get sleepingInterval() {
    const feedCount = this.feedQueue.length || 1;
    return (this.config.pollIntervalSeconds * 1000) / feedCount;
}
constructor(
private readonly config: BridgeConfigFeeds,
private readonly connectionManager: ConnectionManager,
@ -152,20 +173,22 @@ export class FeedReader {
}
}
this.observedFeedUrls = new Set(normalizedUrls);
this.feedQueue = shuffle([...this.observedFeedUrls.values()]);
Metrics.feedsCount.set(this.observedFeedUrls.size);
}
private async loadSeenEntries(): Promise<void> {
try {
const accountData = await this.matrixClient.getAccountData<any>(FeedReader.seenEntriesEventType).catch((err: any) => {
if (err.statusCode === 404) {
return {};
const accountData = await this.matrixClient.getAccountData<AccountData>(FeedReader.seenEntriesEventType).catch((err: MatrixError|unknown) => {
if (err instanceof MatrixError && err.statusCode === 404) {
return {} as AccountData;
} else {
throw err;
}
});
if (!validateAccountData(accountData)) {
const errors = validateAccountData.errors!.map(e => `${e.instancePath} ${e.message}`);
const errors = validateAccountData.errors?.map(e => `${e.instancePath} ${e.message}`) || ['No error reported'];
throw new Error(`Invalid account data: ${errors.join(', ')}`);
}
for (const url in accountData) {
@ -191,10 +214,10 @@ export class FeedReader {
public static async fetchFeed(
url: string,
headers: any,
headers: Record<string, string>,
timeoutMs: number,
parser: Parser = FeedReader.buildParser(),
): Promise<{ response: AxiosResponse<any, any>, feed: Parser.Output<any> }> {
): Promise<{ response: AxiosResponse, feed: Parser.Output<FeedItem> }> {
const response = await axios.get(url, {
headers: {
'User-Agent': UserAgent,
@ -207,16 +230,19 @@ export class FeedReader {
return { response, feed };
}
private async pollFeeds(): Promise<void> {
log.debug(`Checking for updates in ${this.observedFeedUrls.size} RSS/Atom feeds`);
/**
* Poll a given feed URL for data, pushing any entries found into the message queue.
* We also check the `cacheTimes` cache to see if the feed has recent entries that we can
* filter out.
*
* @param url The URL to be polled.
* @returns A boolean that returns if we saw any changes on the feed since the last poll time.
*/
private async pollFeed(url: string): Promise<boolean> {
let seenEntriesChanged = false;
const fetchingStarted = Date.now();
for (const url of this.observedFeedUrls.values()) {
const fetchKey = randomUUID();
const { etag, lastModified } = this.cacheTimes.get(url) || {};
log.debug(`Checking for updates in ${url} (${etag ?? lastModified})`);
try {
const { response, feed } = await FeedReader.fetchFeed(
url,
@ -250,6 +276,7 @@ export class FeedReader {
const seenGuidsSet = new Set(seenGuids);
const newGuids = [];
log.debug(`Found ${feed.items.length} entries in ${url}`);
for (const item of feed.items) {
// Find the first guid-like that looks like a string.
// Some feeds have a nasty habit of leading a empty tag there, making us parse it as garbage.
@ -304,7 +331,7 @@ export class FeedReader {
if (axios.isAxiosError(err)) {
// No new feed items, skip.
if (err.response?.status === StatusCodes.NOT_MODIFIED) {
continue;
return false;
}
this.feedsFailingHttp.add(url);
} else {
@ -314,26 +341,37 @@ export class FeedReader {
const feedError = new FeedError(url.toString(), error, fetchKey);
log.error("Unable to read feed:", feedError.message);
this.queue.push<FeedError>({ eventName: 'feed.error', sender: 'FeedReader', data: feedError});
} finally {
this.feedQueue.push(url);
}
return seenEntriesChanged;
}
private async pollFeeds(): Promise<void> {
log.debug(`Checking for updates in ${this.observedFeedUrls.size} RSS/Atom feeds`);
const fetchingStarted = Date.now();
const [ url ] = this.feedQueue.splice(0, 1);
if (url) {
if (await this.pollFeed(url)) {
await this.saveSeenEntries();
}
}
Metrics.feedsFailing.set({ reason: "http" }, this.feedsFailingHttp.size );
Metrics.feedsFailing.set({ reason: "parsing" }, this.feedsFailingParsing.size);
if (seenEntriesChanged) await this.saveSeenEntries();
const elapsed = Date.now() - fetchingStarted;
Metrics.feedFetchMs.set(elapsed);
let sleepFor: number;
if (elapsed > this.config.pollIntervalSeconds * 1000) {
log.warn(`It took us longer to update the feeds than the configured pool interval (${elapsed / 1000}s)`);
sleepFor = 0;
} else {
sleepFor = this.config.pollIntervalSeconds * 1000 - elapsed;
const sleepFor = Math.min(this.sleepingInterval - elapsed, 0);
log.debug(`Feed fetching took ${elapsed / 1000}s, sleeping for ${sleepFor / 1000}s`);
}
if (elapsed > this.sleepingInterval) {
log.warn(`It took us longer to update the feeds than the configured pool interval`);
}
this.timeout = setTimeout(() => {
if (!this.shouldRun) {