When setting up a FeedConnection, check if the feed loads and parses rather than just checking the Content-Type (#684)

* When setting up a FeedConnection, check if the feed loads and parses rather than just checking the Content-Type

* Don't create a new Parser each time we fetch a feed

* Create 684.bugfix

---------

Co-authored-by: Tadeusz Sośnierz <tadeusz@sosnierz.com>
Co-authored-by: Will Hunt <will@half-shot.uk>
This commit is contained in:
Tadeusz Sośnierz 2023-03-24 15:22:10 +01:00 committed by GitHub
parent 5ccd64acd8
commit 6c4bbc7150
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 43 additions and 30 deletions

1
changelog.d/684.bugfix Normal file
View File

@ -0,0 +1 @@
Don't check Content-Type of RSS feeds when adding a new connection, instead just check if the feed is valid.

View File

@ -2,7 +2,7 @@ import {Appservice, Intent, StateEvent} from "matrix-bot-sdk";
import { IConnection, IConnectionState, InstantiateConnectionOpts } from ".";
import { ApiError, ErrCode } from "../api";
import { BridgeConfigFeeds } from "../Config/Config";
import { FeedEntry, FeedError} from "../feeds/FeedReader";
import { FeedEntry, FeedError, FeedReader} from "../feeds/FeedReader";
import { Logger } from "matrix-appservice-bridge";
import { IBridgeStorageProvider } from "../Stores/StorageProvider";
import { BaseConnection } from "./BaseConnection";
@ -37,6 +37,7 @@ export interface FeedConnectionSecrets {
export type FeedResponseItem = GetConnectionsResponseItem<FeedConnectionState, FeedConnectionSecrets>;
const MAX_LAST_RESULT_ITEMS = 5;
const VALIDATION_FETCH_TIMEOUT_MS = 5000;
@Connection
export class FeedConnection extends BaseConnection implements IConnection {
@ -57,20 +58,11 @@ export class FeedConnection extends BaseConnection implements IConnection {
} catch (ex) {
throw new ApiError("Feed URL doesn't appear valid", ErrCode.BadValue);
}
let res;
try {
res = await axios.head(url).catch(() => axios.get(url));
await FeedReader.fetchFeed(url, {}, VALIDATION_FETCH_TIMEOUT_MS);
} catch (ex) {
throw new ApiError(`Could not read from URL: ${ex.message}`, ErrCode.BadValue);
}
const contentType = res.headers['content-type'];
// we're deliberately liberal here, since different things pop up in the wild
if (!contentType.match(/xml/)) {
throw new ApiError(
`Feed responded with a content type of "${contentType}", which doesn't look like an RSS/Atom feed`,
ErrCode.BadValue,
StatusCodes.UNSUPPORTED_MEDIA_TYPE
);
throw new ApiError(`Could not read feed from URL: ${ex.message}`, ErrCode.BadValue);
}
}

View File

@ -6,7 +6,7 @@ import { Logger } from "matrix-appservice-bridge";
import { MessageQueue } from "../MessageQueue";
import Ajv from "ajv";
import axios from "axios";
import axios, { AxiosResponse } from "axios";
import Parser from "rss-parser";
import Metrics from "../Metrics";
import UserAgent from "../UserAgent";
@ -88,8 +88,7 @@ function normalizeUrl(input: string): string {
}
export class FeedReader {
private readonly parser = new Parser();
private readonly parser = FeedReader.buildParser();
private connections: FeedConnection[];
// ts should notice that we do in fact initialize it in constructor, but it doesn't (in this version)
@ -185,6 +184,28 @@ export class FeedReader {
await this.matrixClient.setAccountData(FeedReader.seenEntriesEventType, accountData);
}
private static buildParser(): Parser {
return new Parser();
}
public static async fetchFeed(
url: string,
headers: any,
timeoutMs: number,
parser: Parser = FeedReader.buildParser(),
): Promise<{ response: AxiosResponse<any, any>, feed: Parser.Output<any> }> {
const response = await axios.get(url, {
headers: {
'User-Agent': UserAgent,
...headers,
},
// We don't want to wait forever for the feed.
timeout: timeoutMs,
});
const feed = await parser.parseString(response.data);
return { response, feed };
}
private async pollFeeds(): Promise<void> {
log.debug(`Checking for updates in ${this.observedFeedUrls.size} RSS/Atom feeds`);
@ -196,28 +217,24 @@ export class FeedReader {
const fetchKey = randomUUID();
const { etag, lastModified } = this.cacheTimes.get(url) || {};
try {
const res = await axios.get(url, {
headers: {
'User-Agent': UserAgent,
const { response, feed } = await FeedReader.fetchFeed(
url,
{
...(lastModified && { 'If-Modified-Since': lastModified}),
...(etag && { 'If-None-Match': etag}),
},
// We don't want to wait forever for the feed.
timeout: this.config.pollTimeoutSeconds * 1000,
});
// Clear any HTTP failures
this.feedsFailingHttp.delete(url);
this.config.pollTimeoutSeconds * 1000,
this.parser,
);
// Store any entity tags/cache times.
if (res.headers.ETag) {
this.cacheTimes.set(url, { etag: res.headers.ETag});
} else if (res.headers['Last-Modified']) {
this.cacheTimes.set(url, { lastModified: res.headers['Last-Modified'] });
if (response.headers.ETag) {
this.cacheTimes.set(url, { etag: response.headers.ETag});
} else if (response.headers['Last-Modified']) {
this.cacheTimes.set(url, { lastModified: response.headers['Last-Modified'] });
}
const feed = await this.parser.parseString(res.data);
this.feedsFailingParsing.delete(url);
let initialSync = false;
let seenGuids = this.seenEntries.get(url);
if (!seenGuids) {
@ -272,6 +289,9 @@ export class FeedReader {
this.seenEntries.set(url, newSeenItems);
}
this.queue.push<FeedSuccess>({ eventName: 'feed.success', sender: 'FeedReader', data: { url: url } });
// Clear any feed failures
this.feedsFailingHttp.delete(url);
this.feedsFailingParsing.delete(url);
} catch (err: unknown) {
if (axios.isAxiosError(err)) {
// No new feed items, skip.