From 43176adf7a149fb93a4d461a3a00d8201d156e98 Mon Sep 17 00:00:00 2001
From: Will Hunt
Date: Tue, 25 Apr 2023 16:45:55 +0100
Subject: [PATCH] Fallback to parsing feeds as atom format if rss format
 fails. (#721)

* Support atom feeds in rust parser

* Add an apply linting command

* Add changelog

* Fixup

* Add tests for atom feeds + remove redundant code

* Remove unused rss-parser

* Tests for all formats.

* Move hashing logic into rust to save cross-context calls

* lint my rust

* Use a String::from

* Ensure guids are not repeated
---
 Cargo.lock               |   1 +
 Cargo.toml               |   1 +
 changelog.d/721.bugfix   |   1 +
 package.json             |   2 +-
 src/feeds/FeedReader.ts  |  62 ++++---------
 src/feeds/parser.rs      | 167 +++++++++++++++++++++++++---------
 tests/FeedReader.spec.ts | 189 ++++++++++++++++++++++++++++++++++-----
 yarn.lock                |  18 +---
 8 files changed, 317 insertions(+), 124 deletions(-)
 create mode 100644 changelog.d/721.bugfix

diff --git a/Cargo.lock b/Cargo.lock
index 456c4f5f..2b26a346 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -279,6 +279,7 @@ dependencies = [
 name = "matrix-hookshot"
 version = "1.8.1"
 dependencies = [
+ "atom_syndication",
  "contrast",
  "hex",
  "md-5",
diff --git a/Cargo.toml b/Cargo.toml
index 9a4af331..7260e560 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -18,6 +18,7 @@ rgb = "0"
 md-5 = "0.8.0"
 hex = "0.4.3"
 rss = "2.0.3"
+atom_syndication = "0.12"
 
 [build-dependencies]
 napi-build = "1"
diff --git a/changelog.d/721.bugfix b/changelog.d/721.bugfix
new file mode 100644
index 00000000..4a4492f3
--- /dev/null
+++ b/changelog.d/721.bugfix
@@ -0,0 +1 @@
+Switch to using Rust for parsing RSS feeds.
diff --git a/package.json b/package.json
index 22980ad0..91052d0a 100644
--- a/package.json
+++ b/package.json
@@ -36,6 +36,7 @@
     "lint": "yarn run lint:js && yarn run lint:rs",
     "lint:js": "eslint -c .eslintrc.js 'src/**/*.ts' 'tests/**/*.ts' 'web/**/*.ts' 'web/**/*.tsx'",
     "lint:rs": "cargo fmt --all -- --check",
+    "lint:rs:apply": "cargo fmt --all",
     "generate-default-config": "ts-node src/Config/Defaults.ts --config > config.sample.yml",
     "validate-config": "ts-node src/Config/Config.ts"
   },
@@ -64,7 +65,6 @@
     "p-queue": "^6.6.2",
     "prom-client": "^14.0.1",
     "reflect-metadata": "^0.1.13",
-    "rss-parser": "^3.12.0",
     "source-map-support": "^0.5.21",
     "string-argv": "^0.3.1",
     "tiny-typed-emitter": "^2.1.0",
diff --git a/src/feeds/FeedReader.ts b/src/feeds/FeedReader.ts
index fff62031..cc671112 100644
--- a/src/feeds/FeedReader.ts
+++ b/src/feeds/FeedReader.ts
@@ -7,13 +7,12 @@ import { MessageQueue } from "../MessageQueue";
 
 import Ajv from "ajv";
 import axios, { AxiosResponse } from "axios";
-import Parser from "rss-parser";
 import Metrics from "../Metrics";
 import UserAgent from "../UserAgent";
 import { randomUUID } from "crypto";
 import { StatusCodes } from "http-status-codes";
 import { FormatUtil } from "../FormatUtil";
-import { FeedItem, parseRSSFeed } from "../libRs";
+import { JsRssChannel, parseFeed } from "../libRs";
 
 const log = new Logger("FeedReader");
@@ -111,10 +110,6 @@ function shuffle<T>(array: T[]): T[] {
 
 export class FeedReader {
-    private static buildParser(): Parser {
-        return new Parser();
-    }
-
     /**
      * Read a feed URL and parse it into a set of items.
      * @param url The feed URL.
@@ -128,7 +123,7 @@ export class FeedReader {
         headers: Record<string, string>,
         timeoutMs: number,
         httpClient = axios,
-    ): Promise<{ response: AxiosResponse, feed: Parser.Output<Record<string, string>> }> {
+    ): Promise<{ response: AxiosResponse, feed: JsRssChannel }> {
         const response = await httpClient.get(url, {
             headers: {
                 'User-Agent': UserAgent,
@@ -141,34 +136,9 @@ export class FeedReader {
         if (typeof response.data !== "string") {
             throw Error('Unexpected response type');
         }
-        const feed = parseRSSFeed(response.data);
+        const feed = parseFeed(response.data);
         return { response, feed };
     }
-
-    /**
-     * Attempt to parse a link from a feed item.
-     * @param item A feed item.
-     * @returns Return either a link to the item, or null.
-     */
-    private static parseLinkFromItem(item: FeedItem) {
-        if (item.link) {
-            return item.link;
-        }
-        if (item.id && item.idIsPermalink) {
-            try {
-                // The feed library doesn't give us attributes (needs isPermaLink), so we're not really sure if this is a URL or not.
-                // Parse it and see.
-                // https://validator.w3.org/feed/docs/rss2.html#ltguidgtSubelementOfLtitemgt
-                const url = new URL(item.id);
-                return url.toString();
-            } catch (ex) {
-                return null;
-            }
-        }
-        return null;
-    }
-
-    private readonly parser = FeedReader.buildParser();
 
     private connections: FeedConnection[];
     // ts should notice that we do in fact initialize it in constructor, but it doesn't (in this version)
@@ -283,7 +253,7 @@ export class FeedReader {
      * @param url The URL to be polled.
      * @returns A boolean that returns if we saw any changes on the feed since the last poll time.
      */
-    private async pollFeed(url: string): Promise<boolean> {
+    public async pollFeed(url: string): Promise<boolean> {
         let seenEntriesChanged = false;
         const fetchKey = randomUUID();
         const { etag, lastModified } = this.cacheTimes.get(url) || {};
@@ -317,7 +287,6 @@ export class FeedReader {
 
             // migrate legacy, cleartext guids to their md5-hashed counterparts
            seenGuids = seenGuids.map(guid => guid.startsWith('md5:') ? guid : this.hashGuid(guid));
-
            const seenGuidsSet = new Set(seenGuids);
            const newGuids = [];
            log.debug(`Found ${feed.items.length} entries in ${url}`);
@@ -325,33 +294,31 @@ export class FeedReader {
             for (const item of feed.items) {
                 // Find the first guid-like that looks like a string.
                 // Some feeds have a nasty habit of leading an empty tag there, making us parse it as garbage.
-                const guid = [item.id, item.link, item.title].find(isNonEmptyString);
-                if (!guid) {
+                if (!item.hashId) {
                     log.error(`Could not determine guid for entry in ${url}, skipping`);
                     continue;
                 }
-                const hashedGuid = this.hashGuid(guid);
-                newGuids.push(hashedGuid);
+                const hashId = `md5:${item.hashId}`;
+                newGuids.push(hashId);
 
                 if (initialSync) {
-                    log.debug(`Skipping entry ${guid} since we're performing an initial sync`);
+                    log.debug(`Skipping entry ${item.id ?? hashId} since we're performing an initial sync`);
                     continue;
                 }
-                if (seenGuidsSet.has(hashedGuid)) {
-                    log.debug('Skipping already seen entry', guid);
+                if (seenGuidsSet.has(hashId)) {
+                    log.debug('Skipping already seen entry', item.id ?? hashId);
                     continue;
                 }
-
                 const entry = {
                     feed: {
                         title: isNonEmptyString(feed.title) ? stripHtml(feed.title) : null,
                         url: url,
                     },
                     title: isNonEmptyString(item.title) ? stripHtml(item.title) : null,
-                    pubdate: item.pubDate ?? null,
+                    pubdate: item.pubdate ?? null,
                     summary: item.summary ?? null,
                     author: item.author ?? null,
-                    link: FeedReader.parseLinkFromItem(item),
+                    link: item.link ?? null,
                     fetchKey
                 };
@@ -395,7 +362,10 @@ export class FeedReader {
         return seenEntriesChanged;
     }
 
-    private async pollFeeds(): Promise<void> {
+    /**
+     * Start polling all the feeds.
+     */
+    public async pollFeeds(): Promise<void> {
         log.debug(`Checking for updates in ${this.observedFeedUrls.size} RSS/Atom feeds`);
 
         const fetchingStarted = Date.now();
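The guid migration above is what keeps previously stored state compatible with the new Rust-computed hashes: stored cleartext guids are rewritten into the same "md5:"-prefixed form that `md5:${item.hashId}` now produces. A minimal sketch of that equivalence, assuming hashGuid and the Rust hash_id helper both yield a hex MD5 digest (the md-5 crate is already a dependency in Cargo.toml); the helper below is an illustration, not code from this patch:

import { createHash } from "crypto";

// Assumed shape: "md5:" followed by the hex MD5 digest of the raw guid.
function hashGuid(guid: string): string {
    return `md5:${createHash("md5").update(guid).digest("hex")}`;
}

// Legacy cleartext guids and freshly hashed ids normalise to the same key,
// so entries seen before this change are still recognised afterwards.
const seenGuids = ["http://www.example.com/blog/post/1", "md5:acbd18db4cc2f85cedef654fccc4a4d8"];
const normalised = seenGuids.map(g => g.startsWith("md5:") ? g : hashGuid(g));
console.log(normalised);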
diff --git a/src/feeds/parser.rs b/src/feeds/parser.rs
index 671fa233..b9e01d00 100644
--- a/src/feeds/parser.rs
+++ b/src/feeds/parser.rs
@@ -1,8 +1,11 @@
 use std::str::FromStr;
 
+use atom_syndication::{Error as AtomError, Feed, Person};
 use napi::bindgen_prelude::{Error as JsError, Status};
 use rss::{Channel, Error as RssError};
 
+use crate::format_util::hash_id;
+
 #[derive(Serialize, Debug, Deserialize)]
 #[napi(object)]
 pub struct FeedItem {
@@ -13,6 +16,7 @@ pub struct FeedItem {
     pub pubdate: Option<String>,
     pub summary: Option<String>,
     pub author: Option<String>,
+    pub hash_id: Option<String>,
 }
 
 #[derive(Serialize, Debug, Deserialize)]
@@ -21,45 +25,128 @@ pub struct JsRssChannel {
     pub title: String,
     pub items: Vec<FeedItem>,
 }
-
-#[napi(js_name = "parseRSSFeed")]
-pub fn js_parse_rss_feed(xml: String) -> Result<JsRssChannel, JsError> {
-    fn map_item_value(original: &str) -> String {
-        original.to_string()
-    }
-    Channel::from_str(&xml)
-        .map(|channel| JsRssChannel {
-            title: channel.title().to_string(),
-            items: channel
-                .items()
-                .iter()
-                .map(|item| FeedItem {
-                    title: item.title().map(map_item_value),
-                    link: item.link().map(map_item_value),
-                    id: item.guid().map(|f| f.value().to_string()),
-                    id_is_permalink: item.guid().map_or(false, |f| f.is_permalink()),
-                    pubdate: item.pub_date().map(map_item_value),
-                    summary: item.description().map(map_item_value),
-                    author: item.author().map(map_item_value),
-                })
-                .collect(),
-        })
-        .map_err(|op| match op {
-            RssError::Utf8(err) => JsError::new(
-                Status::Unknown,
-                format!("An error while converting bytes to UTF8. {}'", err).to_string(),
-            ),
-            RssError::Xml(err) => JsError::new(
-                Status::Unknown,
-                format!("XML parsing error. {}", err).to_string(),
-            ),
-            RssError::InvalidStartTag => JsError::new(
-                Status::Unknown,
-                format!("The input didn't begin with an opening tag.").to_string(),
-            ),
-            err => JsError::new(
-                Status::Unknown,
-                format!("Unknown error trying to parse feed parse feed '{}'", err).to_string(),
-            ),
-        })
+fn parse_channel_to_js_result(channel: &Channel) -> JsRssChannel {
+    JsRssChannel {
+        title: channel.title().to_string(),
+        items: channel
+            .items()
+            .iter()
+            .map(|item: &rss::Item| FeedItem {
+                title: item.title().map(String::from),
+                link: item.link().map(String::from).or_else(|| {
+                    item.guid()
+                        .and_then(|i| i.permalink.then(|| i.value.to_string()))
+                }),
+                id: item.guid().map(|f| f.value().to_string()),
+                id_is_permalink: item.guid().map_or(false, |f| f.is_permalink()),
+                pubdate: item.pub_date().map(String::from),
+                summary: item.description().map(String::from),
+                author: item.author().map(String::from),
+                hash_id: item
+                    .guid
+                    .clone()
+                    .map(|f| f.value)
+                    .or_else(|| item.link.clone())
+                    .or_else(|| item.title.clone())
+                    .and_then(|f| hash_id(f).ok()),
+            })
+            .collect(),
+    }
+}
+
+fn parse_feed_to_js_result(feed: &Feed) -> JsRssChannel {
+    fn authors_to_string(persons: &[Person]) -> Option<String> {
+        if persons.is_empty() {
+            return None;
+        }
+        let mut outs = Vec::<String>::new();
+        for person in persons {
+            let email = person
+                .email
+                .clone()
+                .map_or_else(String::new, |v| format!("<{}>", v));
+            let uri = person
+                .uri
+                .clone()
+                .map_or_else(String::new, |v| format!("<{}>", v));
+            outs.push(format!("{}{}{}", person.name, email, uri))
+        }
+        Some(outs.join(", "))
+    }
+    JsRssChannel {
+        title: feed.title().to_string(),
+        items: feed
+            .entries()
+            .iter()
+            .map(|item| FeedItem {
+                title: Some(item.title().value.clone()),
+                link: item.links().first().map(|f| f.href.clone()),
+                id: Some(item.id.clone()),
+                // No equivalent of isPermaLink in atom.
+                id_is_permalink: false,
+                pubdate: item
+                    .published
+                    .or(Some(item.updated))
+                    .map(|date| date.to_rfc2822()),
+                summary: item.summary().map(|v| v.value.clone()),
+                author: authors_to_string(item.authors()),
+                hash_id: hash_id(item.id.clone()).ok(),
+            })
+            .collect(),
+    }
+}
+
+#[napi(js_name = "parseFeed")]
+pub fn js_parse_feed(xml: String) -> Result<JsRssChannel, JsError> {
+    match Channel::from_str(&xml) {
+        Ok(channel) => Ok(parse_channel_to_js_result(&channel)),
+        // If the input doesn't open with an <rss> tag, parse it again as an Atom feed.
+        Err(RssError::InvalidStartTag) => match Feed::from_str(&xml) {
+            Ok(feed) => Ok(parse_feed_to_js_result(&feed)),
+            Err(AtomError::Eof) => Err(JsError::new(
+                Status::Unknown,
+                String::from("Unexpected end of input."),
+            )),
+            Err(AtomError::InvalidStartTag) => Err(JsError::new(
+                Status::Unknown,
+                String::from("The input didn't begin with an opening <feed> tag."),
+            )),
+            Err(AtomError::WrongAttribute { attribute, value }) => Err(JsError::new(
+                Status::Unknown,
+                format!(
+                    "The attribute '{}' had the wrong value '{}'",
+                    attribute, value
+                ),
+            )),
+            Err(AtomError::WrongDatetime(value)) => Err(JsError::new(
+                Status::Unknown,
+                format!("The format of the datetime ('{}') was wrong.", value),
+            )),
+            Err(AtomError::Xml(err)) => Err(JsError::new(
+                Status::Unknown,
+                format!("XML parsing error: {}", err),
+            )),
+            Err(err) => Err(JsError::new(
+                Status::Unknown,
+                format!("Unknown error trying to parse feed: '{}'", err),
+            )),
+        },
+        Err(RssError::Utf8(err)) => Err(JsError::new(
+            Status::Unknown,
+            format!("An error while converting bytes to UTF8: {}", err),
+        )),
+        Err(RssError::Xml(err)) => Err(JsError::new(
+            Status::Unknown,
+            format!("XML parsing error: {}", err),
+        )),
+        Err(RssError::Eof) => Err(JsError::new(
+            Status::Unknown,
+            String::from("Unexpected end of input."),
+        )),
+    }
+}
{}'", err).to_string(), + )), + Err(RssError::Xml(err)) => Err(JsError::new( + Status::Unknown, + format!("XML parsing error. {}", err).to_string(), + )), + Err(RssError::Eof) => Err(JsError::new( + Status::Unknown, + format!("Unexpected end of input").to_string(), + )), + } } diff --git a/tests/FeedReader.spec.ts b/tests/FeedReader.spec.ts index 153be2fd..2cd323ce 100644 --- a/tests/FeedReader.spec.ts +++ b/tests/FeedReader.spec.ts @@ -4,7 +4,7 @@ import EventEmitter from "events"; import { BridgeConfigFeeds } from "../src/Config/Config"; import { ConnectionManager } from "../src/ConnectionManager"; import { IConnection } from "../src/Connections"; -import { FeedReader } from "../src/feeds/FeedReader"; +import { FeedEntry, FeedReader } from "../src/feeds/FeedReader"; import { MessageQueue, MessageQueueMessage } from "../src/MessageQueue"; class MockConnectionManager extends EventEmitter { @@ -45,17 +45,31 @@ class MockHttpClient { } } +const FEED_URL = 'http://test/'; + +function constructFeedReader(feedResponse: () => {headers: Record, data: string}) { + const config = new BridgeConfigFeeds({ + enabled: true, + pollIntervalSeconds: 1, + pollTimeoutSeconds: 1, + }); + const cm = new MockConnectionManager([{ feedUrl: FEED_URL } as unknown as IConnection]) as unknown as ConnectionManager + const mq = new MockMessageQueue(); + const feedReader = new FeedReader( + config, cm, mq, + { + getAccountData: () => Promise.resolve({ [FEED_URL]: [] } as unknown as T), + setAccountData: () => Promise.resolve(), + }, + new MockHttpClient({ ...feedResponse() } as AxiosResponse) as unknown as AxiosStatic, + ); + return {config, cm, mq, feedReader}; +} + describe("FeedReader", () => { it("should correctly handle empty titles", async () => { - const config = new BridgeConfigFeeds({ - enabled: true, - pollIntervalSeconds: 1, - pollTimeoutSeconds: 1, - }); - const cm = new MockConnectionManager([{ feedUrl: 'http://test/' } as unknown as IConnection]) as unknown as ConnectionManager - const mq = new MockMessageQueue(); - - const feedContents = ` + const { mq, feedReader} = constructFeedReader(() => ({ + headers: {}, data: ` test feedhttp://test/ @@ -67,16 +81,8 @@ describe("FeedReader", () => { Wed, 12 Apr 2023 09:53:00 GMT - `; - - const feedReader = new FeedReader( - config, cm, mq, - { - getAccountData: () => Promise.resolve({ 'http://test/': [] } as unknown as T), - setAccountData: () => Promise.resolve(), - }, - new MockHttpClient({ headers: {}, data: feedContents } as AxiosResponse) as unknown as AxiosStatic, - ); + ` + })); const event: any = await new Promise((resolve) => { mq.on('pushed', (data) => { resolve(data); feedReader.stop() }); @@ -86,4 +92,147 @@ describe("FeedReader", () => { expect(event.data.feed.title).to.equal(null); expect(event.data.title).to.equal(null); }); + it("should handle RSS 2.0 feeds", async () => { + const { mq, feedReader} = constructFeedReader(() => ({ + headers: {}, data: ` + + + + RSS Title + This is an example of an RSS feed + http://www.example.com/main.html + 2020 Example.com All rights reserved + Mon, 6 Sep 2010 00:01:00 +0000 + Sun, 6 Sep 2009 16:20:00 +0000 + 1800 + + Example entry + John Doe + Here is some text containing an interesting description. 
+ http://www.example.com/blog/post/1 + 7bd204c6-1655-4c27-aeee-53f933c5395f + Sun, 6 Sep 2009 16:20:00 +0000 + + + + ` + })); + + const event: MessageQueueMessage = await new Promise((resolve) => { + mq.on('pushed', (data) => { resolve(data); feedReader.stop() }); + }); + + expect(event.eventName).to.equal('feed.entry'); + expect(event.data.feed.title).to.equal('RSS Title'); + expect(event.data.author).to.equal('John Doe'); + expect(event.data.title).to.equal('Example entry'); + expect(event.data.summary).to.equal('Here is some text containing an interesting description.'); + expect(event.data.link).to.equal('http://www.example.com/blog/post/1'); + expect(event.data.pubdate).to.equal('Sun, 6 Sep 2009 16:20:00 +0000'); + }); + it("should handle RSS feeds with a permalink url", async () => { + const { mq, feedReader} = constructFeedReader(() => ({ + headers: {}, data: ` + + + + RSS Title + This is an example of an RSS feed + http://www.example.com/main.html + 2020 Example.com All rights reserved + Mon, 6 Sep 2010 00:01:00 +0000 + Sun, 6 Sep 2009 16:20:00 +0000 + 1800 + + Example entry + John Doe + Here is some text containing an interesting description. + http://www.example.com/blog/post/1 + Sun, 6 Sep 2009 16:20:00 +0000 + + + + ` + })); + + const event: MessageQueueMessage = await new Promise((resolve) => { + mq.on('pushed', (data) => { resolve(data); feedReader.stop() }); + }); + + expect(event.eventName).to.equal('feed.entry'); + expect(event.data.feed.title).to.equal('RSS Title'); + expect(event.data.author).to.equal('John Doe'); + expect(event.data.title).to.equal('Example entry'); + expect(event.data.summary).to.equal('Here is some text containing an interesting description.'); + expect(event.data.link).to.equal('http://www.example.com/blog/post/1'); + expect(event.data.pubdate).to.equal('Sun, 6 Sep 2009 16:20:00 +0000'); + }); + it("should handle Atom feeds", async () => { + const { mq, feedReader} = constructFeedReader(() => ({ + headers: {}, data: ` + + + + Example Feed + + 2003-12-13T18:30:02Z + + John Doe + + urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6 + + + + John Doe + + Atom-Powered Robots Run Amok + + urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a + 2003-12-13T18:30:02Z + Some text. + + + + ` + })); + + const event: MessageQueueMessage = await new Promise((resolve) => { + mq.on('pushed', (data) => { resolve(data); feedReader.stop() }); + }); + + expect(event.eventName).to.equal('feed.entry'); + expect(event.data.feed.title).to.equal('Example Feed'); + expect(event.data.title).to.equal('Atom-Powered Robots Run Amok'); + expect(event.data.author).to.equal('John Doe'); + expect(event.data.summary).to.equal('Some text.'); + expect(event.data.link).to.equal('http://example.org/2003/12/13/atom03'); + expect(event.data.pubdate).to.equal('Sat, 13 Dec 2003 18:30:02 +0000'); + }); + it("should not duplicate feed entries", async () => { + const { mq, feedReader} = constructFeedReader(() => ({ + headers: {}, data: ` + + + + + John Doe + + Atom-Powered Robots Run Amok + + urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a + 2003-12-13T18:30:02Z + Some text. 
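The specs above assert field by field against the queued payload. For reference, this is the shape they exercise, reconstructed here from the entry literal built in FeedReader.pollFeed; the authoritative FeedEntry type lives in src/feeds/FeedReader.ts:

// Reconstructed from the entry object in pollFeed plus its fetchKey.
interface FeedEntry {
    feed: {
        title: string|null;
        url: string;
    };
    title: string|null;
    pubdate: string|null;
    summary: string|null;
    author: string|null;
    link: string|null;
    fetchKey: string;
}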
diff --git a/yarn.lock b/yarn.lock
index de369eee..66c67525 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -2661,7 +2661,7 @@ enquirer@^2.3.5:
   dependencies:
     ansi-colors "^4.1.1"
 
-entities@^2.0.0, entities@^2.0.3:
+entities@^2.0.0:
   version "2.2.0"
   resolved "https://registry.yarnpkg.com/entities/-/entities-2.2.0.tgz#098dc90ebb83d8dffa089d55256b351d34c4da55"
   integrity sha512-p92if5Nz619I0w+akJrLZH0MX0Pb5DX39XOwQTtXSdQQOaYH03S1uIQp4mhOZtAXrxq4ViO67YTiLBo2638o9A==
@@ -5369,14 +5369,6 @@ rollup@^3.10.0:
   optionalDependencies:
     fsevents "~2.3.2"
 
-rss-parser@^3.12.0:
-  version "3.12.0"
-  resolved "https://registry.yarnpkg.com/rss-parser/-/rss-parser-3.12.0.tgz#b8888699ea46304a74363fbd8144671b2997984c"
-  integrity sha512-aqD3E8iavcCdkhVxNDIdg1nkBI17jgqF+9OqPS1orwNaOgySdpvq6B+DoONLhzjzwV8mWg37sb60e4bmLK117A==
-  dependencies:
-    entities "^2.0.3"
-    xml2js "^0.4.19"
-
 run-parallel@^1.1.9:
   version "1.2.0"
   resolved "https://registry.yarnpkg.com/run-parallel/-/run-parallel-1.2.0.tgz#66d1368da7bdf921eb9d95bd1a9229e7f21a43ee"
@@ -6218,14 +6210,6 @@ write-file-atomic@^3.0.0:
     signal-exit "^3.0.2"
     typedarray-to-buffer "^3.1.5"
 
-xml2js@^0.4.19:
-  version "0.4.23"
-  resolved "https://registry.yarnpkg.com/xml2js/-/xml2js-0.4.23.tgz#a0c69516752421eb2ac758ee4d4ccf58843eac66"
-  integrity sha512-ySPiMjM0+pLDftHgXY4By0uswI3SPKLDw/i3UXbnO8M/p28zqexCUoPmQFrYD+/1BzhGJSs2i1ERWKJAtiLrug==
-  dependencies:
-    sax ">=0.6.0"
-    xmlbuilder "~11.0.0"
-
 xml2js@^0.5.0:
   version "0.5.0"
   resolved "https://registry.yarnpkg.com/xml2js/-/xml2js-0.5.0.tgz#d9440631fbb2ed800203fad106f2724f62c493b7"
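With rss-parser and its xml2js@^0.4.19 dependency dropped from the lockfile, both feed formats now round-trip through the Rust binding alone. A final end-to-end sketch, under the same assumptions as the examples above:

import { parseFeed } from "./libRs";

const rss = `<?xml version="1.0"?><rss version="2.0"><channel><title>An RSS feed</title></channel></rss>`;
const atom = `<?xml version="1.0"?><feed xmlns="http://www.w3.org/2005/Atom"><title>An Atom feed</title><id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id><updated>2003-12-13T18:30:02Z</updated></feed>`;

// One entry point for both formats; the Atom document is only handed to
// atom_syndication after the RSS parse fails with InvalidStartTag.
console.log(parseFeed(rss).title);  // "An RSS feed"
console.log(parseFeed(atom).title); // "An Atom feed"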