Fallback to parsing feeds as atom format if rss format fails. (#721)

* Support atom feeds in rust parser

* Add an apply linting command

* Add changelog

* Fixup

* Add tests for atom feeds + remove redundant code

* Remove unused rss-parser

* Tests for all formats.

* Move hashing logic into rust to save cross-context calls

* lint my rust

* Use a String::from

* Ensure guids are not repeated
This commit is contained in:
Will Hunt 2023-04-25 16:45:55 +01:00 committed by GitHub
parent ba5b28ab92
commit 43176adf7a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 317 additions and 124 deletions

1
Cargo.lock generated
View File

@ -279,6 +279,7 @@ dependencies = [
name = "matrix-hookshot"
version = "1.8.1"
dependencies = [
"atom_syndication",
"contrast",
"hex",
"md-5",

View File

@ -18,6 +18,7 @@ rgb = "0"
md-5 = "0.8.0"
hex = "0.4.3"
rss = "2.0.3"
atom_syndication = "0.12"
[build-dependencies]
napi-build = "1"

1
changelog.d/721.bugfix Normal file
View File

@ -0,0 +1 @@
Switch to using Rust for parsing RSS and Atom feeds.

View File

@ -36,6 +36,7 @@
"lint": "yarn run lint:js && yarn run lint:rs",
"lint:js": "eslint -c .eslintrc.js 'src/**/*.ts' 'tests/**/*.ts' 'web/**/*.ts' 'web/**/*.tsx'",
"lint:rs": "cargo fmt --all -- --check",
"lint:rs:apply": "cargo fmt --all",
"generate-default-config": "ts-node src/Config/Defaults.ts --config > config.sample.yml",
"validate-config": "ts-node src/Config/Config.ts"
},
@ -64,7 +65,6 @@
"p-queue": "^6.6.2",
"prom-client": "^14.0.1",
"reflect-metadata": "^0.1.13",
"rss-parser": "^3.12.0",
"source-map-support": "^0.5.21",
"string-argv": "^0.3.1",
"tiny-typed-emitter": "^2.1.0",

View File

@ -7,13 +7,12 @@ import { MessageQueue } from "../MessageQueue";
import Ajv from "ajv";
import axios, { AxiosResponse } from "axios";
import Parser from "rss-parser";
import Metrics from "../Metrics";
import UserAgent from "../UserAgent";
import { randomUUID } from "crypto";
import { StatusCodes } from "http-status-codes";
import { FormatUtil } from "../FormatUtil";
import { FeedItem, parseRSSFeed } from "../libRs";
import { JsRssChannel, parseFeed } from "../libRs";
const log = new Logger("FeedReader");
@ -111,10 +110,6 @@ function shuffle<T>(array: T[]): T[] {
export class FeedReader {
private static buildParser(): Parser {
return new Parser();
}
/**
* Read a feed URL and parse it into a set of items.
* @param url The feed URL.
@ -128,7 +123,7 @@ export class FeedReader {
headers: Record<string, string>,
timeoutMs: number,
httpClient = axios,
): Promise<{ response: AxiosResponse, feed: Parser.Output<FeedItem> }> {
): Promise<{ response: AxiosResponse, feed: JsRssChannel }> {
const response = await httpClient.get(url, {
headers: {
'User-Agent': UserAgent,
@ -141,35 +136,10 @@ export class FeedReader {
if (typeof response.data !== "string") {
throw Error('Unexpected response type');
}
const feed = parseRSSFeed(response.data);
const feed = parseFeed(response.data);
return { response, feed };
}
/**
* Attempt to parse a link from a feed item.
* @param item A feed item.
* @returns Return either a link to the item, or null.
*/
private static parseLinkFromItem(item: FeedItem) {
if (item.link) {
return item.link;
}
if (item.id && item.idIsPermalink) {
try {
// The feed librray doesn't give us attributes (needs isPermaLink), so we're not really sure if this a URL or not.
// Parse it and see.
// https://validator.w3.org/feed/docs/rss2.html#ltguidgtSubelementOfLtitemgt
const url = new URL(item.id);
return url.toString();
} catch (ex) {
return null;
}
}
return null;
}
private readonly parser = FeedReader.buildParser();
private connections: FeedConnection[];
// ts should notice that we do in fact initialize it in constructor, but it doesn't (in this version)
private observedFeedUrls: Set<string> = new Set();
@ -283,7 +253,7 @@ export class FeedReader {
* @param url The URL to be polled.
* @returns A boolean that returns if we saw any changes on the feed since the last poll time.
*/
private async pollFeed(url: string): Promise<boolean> {
public async pollFeed(url: string): Promise<boolean> {
let seenEntriesChanged = false;
const fetchKey = randomUUID();
const { etag, lastModified } = this.cacheTimes.get(url) || {};
@ -317,7 +287,6 @@ export class FeedReader {
// migrate legacy, cleartext guids to their md5-hashed counterparts
seenGuids = seenGuids.map(guid => guid.startsWith('md5:') ? guid : this.hashGuid(guid));
const seenGuidsSet = new Set(seenGuids);
const newGuids = [];
log.debug(`Found ${feed.items.length} entries in ${url}`);
@ -325,33 +294,31 @@ export class FeedReader {
for (const item of feed.items) {
// Find the first guid-like that looks like a string.
// Some feeds have a nasty habit of leading a empty tag there, making us parse it as garbage.
const guid = [item.id, item.link, item.title].find(isNonEmptyString);
if (!guid) {
if (!item.hashId) {
log.error(`Could not determine guid for entry in ${url}, skipping`);
continue;
}
const hashedGuid = this.hashGuid(guid);
newGuids.push(hashedGuid);
const hashId = `md5:${item.hashId}`;
newGuids.push(hashId);
if (initialSync) {
log.debug(`Skipping entry ${guid} since we're performing an initial sync`);
log.debug(`Skipping entry ${item.id ?? hashId} since we're performing an initial sync`);
continue;
}
if (seenGuidsSet.has(hashedGuid)) {
log.debug('Skipping already seen entry', guid);
if (seenGuidsSet.has(hashId)) {
log.debug('Skipping already seen entry', item.id ?? hashId);
continue;
}
const entry = {
feed: {
title: isNonEmptyString(feed.title) ? stripHtml(feed.title) : null,
url: url,
},
title: isNonEmptyString(item.title) ? stripHtml(item.title) : null,
pubdate: item.pubDate ?? null,
pubdate: item.pubdate ?? null,
summary: item.summary ?? null,
author: item.author ?? null,
link: FeedReader.parseLinkFromItem(item),
link: item.link ?? null,
fetchKey
};
@ -395,7 +362,10 @@ export class FeedReader {
return seenEntriesChanged;
}
private async pollFeeds(): Promise<void> {
/**
* Start polling all the feeds.
*/
public async pollFeeds(): Promise<void> {
log.debug(`Checking for updates in ${this.observedFeedUrls.size} RSS/Atom feeds`);
const fetchingStarted = Date.now();

View File

@ -1,8 +1,11 @@
use std::str::FromStr;
use atom_syndication::{Error as AtomError, Feed, Person};
use napi::bindgen_prelude::{Error as JsError, Status};
use rss::{Channel, Error as RssError};
use crate::format_util::hash_id;
#[derive(Serialize, Debug, Deserialize)]
#[napi(object)]
pub struct FeedItem {
@ -13,6 +16,7 @@ pub struct FeedItem {
pub pubdate: Option<String>,
pub summary: Option<String>,
pub author: Option<String>,
pub hash_id: Option<String>,
}
#[derive(Serialize, Debug, Deserialize)]
@ -21,45 +25,128 @@ pub struct JsRssChannel {
pub title: String,
pub items: Vec<FeedItem>,
}
#[napi(js_name = "parseRSSFeed")]
pub fn js_parse_rss_feed(xml: String) -> Result<JsRssChannel, JsError> {
fn map_item_value(original: &str) -> String {
original.to_string()
}
Channel::from_str(&xml)
.map(|channel| JsRssChannel {
title: channel.title().to_string(),
items: channel
.items()
.iter()
.map(|item| FeedItem {
title: item.title().map(map_item_value),
link: item.link().map(map_item_value),
id: item.guid().map(|f| f.value().to_string()),
id_is_permalink: item.guid().map_or(false, |f| f.is_permalink()),
pubdate: item.pub_date().map(map_item_value),
summary: item.description().map(map_item_value),
author: item.author().map(map_item_value),
})
.collect(),
})
.map_err(|op| match op {
RssError::Utf8(err) => JsError::new(
Status::Unknown,
format!("An error while converting bytes to UTF8. {}'", err).to_string(),
),
RssError::Xml(err) => JsError::new(
Status::Unknown,
format!("XML parsing error. {}", err).to_string(),
),
RssError::InvalidStartTag => JsError::new(
Status::Unknown,
format!("The input didn't begin with an opening <rss> tag.").to_string(),
),
err => JsError::new(
Status::Unknown,
format!("Unknown error trying to parse feed parse feed '{}'", err).to_string(),
),
})
/// Convert a parsed RSS channel into the JS-facing feed structure.
fn parse_channel_to_js_result(channel: &Channel) -> JsRssChannel {
    JsRssChannel {
        title: channel.title().to_string(),
        items: channel
            .items()
            .iter()
            .map(|item: &rss::Item| FeedItem {
                title: item.title().map(String::from),
                // Prefer an explicit <link>; otherwise fall back to a
                // permalink-style <guid> value, which is a URL by contract.
                // https://validator.w3.org/feed/docs/rss2.html#ltguidgtSubelementOfLtitemgt
                link: item.link().map(String::from).or_else(|| {
                    item.guid()
                        .and_then(|i| i.permalink.then(|| i.value.to_string()))
                }),
                id: item.guid().map(|f| f.value().to_string()),
                id_is_permalink: item.guid().map_or(false, |f| f.is_permalink()),
                pubdate: item.pub_date().map(String::from),
                summary: item.description().map(String::from),
                author: item.author().map(String::from),
                // Hash the best available identifier (guid > link > title) so
                // entries can be de-duplicated across polls.
                hash_id: item
                    .guid
                    .clone()
                    .map(|f| f.value)
                    .or_else(|| item.link.clone())
                    .or_else(|| item.title.clone())
                    .and_then(|f| hash_id(f).ok()),
            })
            .collect(),
    }
}
/// Convert a parsed Atom feed into the JS-facing feed structure.
fn parse_feed_to_js_result(feed: &Feed) -> JsRssChannel {
    /// Join a list of Atom authors into one display string, formatting each
    /// person as `name<email><uri>` (email/uri omitted when absent).
    fn authors_to_string(persons: &[Person]) -> Option<String> {
        if persons.is_empty() {
            return None;
        }
        let formatted: Vec<String> = persons
            .iter()
            .map(|person| {
                // Borrow instead of cloning the Option<String> fields.
                let email = person
                    .email
                    .as_ref()
                    .map_or_else(String::new, |v| format!("<{}>", v));
                let uri = person
                    .uri
                    .as_ref()
                    .map_or_else(String::new, |v| format!("<{}>", v));
                format!("{}{}{}", person.name, email, uri)
            })
            .collect();
        Some(formatted.join(", "))
    }
    JsRssChannel {
        title: feed.title().to_string(),
        items: feed
            .entries()
            .iter()
            .map(|item| FeedItem {
                title: Some(item.title().value.clone()),
                link: item.links().first().map(|f| f.href.clone()),
                id: Some(item.id.clone()),
                // Atom has no equivalent of RSS's isPermaLink attribute.
                id_is_permalink: false,
                // Fall back to the mandatory <updated> date when the optional
                // <published> date is absent.
                pubdate: item
                    .published
                    .or(Some(item.updated))
                    .map(|date| date.to_rfc2822()),
                summary: item.summary().map(|v| v.value.clone()),
                author: authors_to_string(item.authors()),
                // Atom IDs are mandatory and unique, so hash the ID directly.
                hash_id: hash_id(item.id.clone()).ok(),
            })
            .collect(),
    }
}
/// Parse a feed document, attempting RSS first and falling back to Atom
/// when the root element is not an <rss> tag.
///
/// Returns a `JsRssChannel` on success, or a `JsError` describing the
/// parse failure of whichever format was attempted last.
#[napi(js_name = "parseFeed")]
pub fn js_parse_feed(xml: String) -> Result<JsRssChannel, JsError> {
    match Channel::from_str(&xml) {
        Ok(channel) => Ok(parse_channel_to_js_result(&channel)),
        // The input didn't begin with an <rss> tag, so retry as Atom.
        Err(RssError::InvalidStartTag) => match Feed::from_str(&xml) {
            Ok(feed) => Ok(parse_feed_to_js_result(&feed)),
            Err(AtomError::Eof) => Err(JsError::new(
                Status::Unknown,
                "Unexpected end of input.".to_string(),
            )),
            Err(AtomError::InvalidStartTag) => Err(JsError::new(
                Status::Unknown,
                "The input didn't begin with an opening <feed> tag.".to_string(),
            )),
            Err(AtomError::WrongAttribute { attribute, value }) => Err(JsError::new(
                Status::Unknown,
                format!(
                    "The attribute '{}' had the wrong value '{}'",
                    attribute, value
                ),
            )),
            Err(AtomError::WrongDatetime(value)) => Err(JsError::new(
                Status::Unknown,
                format!("The format of the datetime ('{}') was wrong.", value),
            )),
            Err(AtomError::Xml(err)) => Err(JsError::new(
                Status::Unknown,
                format!("XML parsing error. {}", err),
            )),
            Err(err) => Err(JsError::new(
                Status::Unknown,
                format!("Unknown error trying to parse feed '{}'", err),
            )),
        },
        Err(RssError::Utf8(err)) => Err(JsError::new(
            Status::Unknown,
            format!("An error while converting bytes to UTF8. {}", err),
        )),
        Err(RssError::Xml(err)) => Err(JsError::new(
            Status::Unknown,
            format!("XML parsing error. {}", err),
        )),
        Err(RssError::Eof) => Err(JsError::new(
            Status::Unknown,
            "Unexpected end of input.".to_string(),
        )),
    }
}

View File

@ -4,7 +4,7 @@ import EventEmitter from "events";
import { BridgeConfigFeeds } from "../src/Config/Config";
import { ConnectionManager } from "../src/ConnectionManager";
import { IConnection } from "../src/Connections";
import { FeedReader } from "../src/feeds/FeedReader";
import { FeedEntry, FeedReader } from "../src/feeds/FeedReader";
import { MessageQueue, MessageQueueMessage } from "../src/MessageQueue";
class MockConnectionManager extends EventEmitter {
@ -45,17 +45,31 @@ class MockHttpClient {
}
}
const FEED_URL = 'http://test/';
function constructFeedReader(feedResponse: () => {headers: Record<string,string>, data: string}) {
const config = new BridgeConfigFeeds({
enabled: true,
pollIntervalSeconds: 1,
pollTimeoutSeconds: 1,
});
const cm = new MockConnectionManager([{ feedUrl: FEED_URL } as unknown as IConnection]) as unknown as ConnectionManager
const mq = new MockMessageQueue();
const feedReader = new FeedReader(
config, cm, mq,
{
getAccountData: <T>() => Promise.resolve({ [FEED_URL]: [] } as unknown as T),
setAccountData: () => Promise.resolve(),
},
new MockHttpClient({ ...feedResponse() } as AxiosResponse) as unknown as AxiosStatic,
);
return {config, cm, mq, feedReader};
}
describe("FeedReader", () => {
it("should correctly handle empty titles", async () => {
const config = new BridgeConfigFeeds({
enabled: true,
pollIntervalSeconds: 1,
pollTimeoutSeconds: 1,
});
const cm = new MockConnectionManager([{ feedUrl: 'http://test/' } as unknown as IConnection]) as unknown as ConnectionManager
const mq = new MockMessageQueue();
const feedContents = `
const { mq, feedReader} = constructFeedReader(() => ({
headers: {}, data: `
<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
<channel><title type='text'></title><description>test feed</description><link>http://test/</link>
@ -67,16 +81,8 @@ describe("FeedReader", () => {
<pubDate>Wed, 12 Apr 2023 09:53:00 GMT</pubDate>
</item>
</channel></rss>
`;
const feedReader = new FeedReader(
config, cm, mq,
{
getAccountData: <T>() => Promise.resolve({ 'http://test/': [] } as unknown as T),
setAccountData: () => Promise.resolve(),
},
new MockHttpClient({ headers: {}, data: feedContents } as AxiosResponse) as unknown as AxiosStatic,
);
`
}));
const event: any = await new Promise((resolve) => {
mq.on('pushed', (data) => { resolve(data); feedReader.stop() });
@ -86,4 +92,147 @@ describe("FeedReader", () => {
expect(event.data.feed.title).to.equal(null);
expect(event.data.title).to.equal(null);
});
it("should handle RSS 2.0 feeds", async () => {
const { mq, feedReader} = constructFeedReader(() => ({
headers: {}, data: `
<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0">
<channel>
<title>RSS Title</title>
<description>This is an example of an RSS feed</description>
<link>http://www.example.com/main.html</link>
<copyright>2020 Example.com All rights reserved</copyright>
<lastBuildDate>Mon, 6 Sep 2010 00:01:00 +0000</lastBuildDate>
<pubDate>Sun, 6 Sep 2009 16:20:00 +0000</pubDate>
<ttl>1800</ttl>
<item>
<title>Example entry</title>
<author>John Doe</author>
<description>Here is some text containing an interesting description.</description>
<link>http://www.example.com/blog/post/1</link>
<guid isPermaLink="false">7bd204c6-1655-4c27-aeee-53f933c5395f</guid>
<pubDate>Sun, 6 Sep 2009 16:20:00 +0000</pubDate>
</item>
</channel>
</rss>
`
}));
const event: MessageQueueMessage<FeedEntry> = await new Promise((resolve) => {
mq.on('pushed', (data) => { resolve(data); feedReader.stop() });
});
expect(event.eventName).to.equal('feed.entry');
expect(event.data.feed.title).to.equal('RSS Title');
expect(event.data.author).to.equal('John Doe');
expect(event.data.title).to.equal('Example entry');
expect(event.data.summary).to.equal('Here is some text containing an interesting description.');
expect(event.data.link).to.equal('http://www.example.com/blog/post/1');
expect(event.data.pubdate).to.equal('Sun, 6 Sep 2009 16:20:00 +0000');
});
it("should handle RSS feeds with a permalink url", async () => {
const { mq, feedReader} = constructFeedReader(() => ({
headers: {}, data: `
<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0">
<channel>
<title>RSS Title</title>
<description>This is an example of an RSS feed</description>
<link>http://www.example.com/main.html</link>
<copyright>2020 Example.com All rights reserved</copyright>
<lastBuildDate>Mon, 6 Sep 2010 00:01:00 +0000</lastBuildDate>
<pubDate>Sun, 6 Sep 2009 16:20:00 +0000</pubDate>
<ttl>1800</ttl>
<item>
<title>Example entry</title>
<author>John Doe</author>
<description>Here is some text containing an interesting description.</description>
<guid isPermaLink="true">http://www.example.com/blog/post/1</guid>
<pubDate>Sun, 6 Sep 2009 16:20:00 +0000</pubDate>
</item>
</channel>
</rss>
`
}));
const event: MessageQueueMessage<FeedEntry> = await new Promise((resolve) => {
mq.on('pushed', (data) => { resolve(data); feedReader.stop() });
});
expect(event.eventName).to.equal('feed.entry');
expect(event.data.feed.title).to.equal('RSS Title');
expect(event.data.author).to.equal('John Doe');
expect(event.data.title).to.equal('Example entry');
expect(event.data.summary).to.equal('Here is some text containing an interesting description.');
expect(event.data.link).to.equal('http://www.example.com/blog/post/1');
expect(event.data.pubdate).to.equal('Sun, 6 Sep 2009 16:20:00 +0000');
});
it("should handle Atom feeds", async () => {
const { mq, feedReader} = constructFeedReader(() => ({
headers: {}, data: `
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Feed</title>
<link href="http://example.org/"/>
<updated>2003-12-13T18:30:02Z</updated>
<author>
<name>John Doe</name>
</author>
<id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
<entry>
<author>
<name>John Doe</name>
</author>
<title>Atom-Powered Robots Run Amok</title>
<link href="http://example.org/2003/12/13/atom03"/>
<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
<updated>2003-12-13T18:30:02Z</updated>
<summary>Some text.</summary>
</entry>
</feed>
`
}));
const event: MessageQueueMessage<FeedEntry> = await new Promise((resolve) => {
mq.on('pushed', (data) => { resolve(data); feedReader.stop() });
});
expect(event.eventName).to.equal('feed.entry');
expect(event.data.feed.title).to.equal('Example Feed');
expect(event.data.title).to.equal('Atom-Powered Robots Run Amok');
expect(event.data.author).to.equal('John Doe');
expect(event.data.summary).to.equal('Some text.');
expect(event.data.link).to.equal('http://example.org/2003/12/13/atom03');
expect(event.data.pubdate).to.equal('Sat, 13 Dec 2003 18:30:02 +0000');
});
it("should not duplicate feed entries", async () => {
const { mq, feedReader} = constructFeedReader(() => ({
headers: {}, data: `
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<entry>
<author>
<name>John Doe</name>
</author>
<title>Atom-Powered Robots Run Amok</title>
<link href="http://example.org/2003/12/13/atom03"/>
<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
<updated>2003-12-13T18:30:02Z</updated>
<summary>Some text.</summary>
</entry>
</feed>
`
}));
const events: MessageQueueMessage<FeedEntry>[] = [];
mq.on('pushed', (data) => { if (data.eventName === 'feed.entry') {events.push(data);} });
await feedReader.pollFeed(FEED_URL);
await feedReader.pollFeed(FEED_URL);
await feedReader.pollFeed(FEED_URL);
feedReader.stop();
expect(events).to.have.lengthOf(1);
});
});

View File

@ -2661,7 +2661,7 @@ enquirer@^2.3.5:
dependencies:
ansi-colors "^4.1.1"
entities@^2.0.0, entities@^2.0.3:
entities@^2.0.0:
version "2.2.0"
resolved "https://registry.yarnpkg.com/entities/-/entities-2.2.0.tgz#098dc90ebb83d8dffa089d55256b351d34c4da55"
integrity sha512-p92if5Nz619I0w+akJrLZH0MX0Pb5DX39XOwQTtXSdQQOaYH03S1uIQp4mhOZtAXrxq4ViO67YTiLBo2638o9A==
@ -5369,14 +5369,6 @@ rollup@^3.10.0:
optionalDependencies:
fsevents "~2.3.2"
rss-parser@^3.12.0:
version "3.12.0"
resolved "https://registry.yarnpkg.com/rss-parser/-/rss-parser-3.12.0.tgz#b8888699ea46304a74363fbd8144671b2997984c"
integrity sha512-aqD3E8iavcCdkhVxNDIdg1nkBI17jgqF+9OqPS1orwNaOgySdpvq6B+DoONLhzjzwV8mWg37sb60e4bmLK117A==
dependencies:
entities "^2.0.3"
xml2js "^0.4.19"
run-parallel@^1.1.9:
version "1.2.0"
resolved "https://registry.yarnpkg.com/run-parallel/-/run-parallel-1.2.0.tgz#66d1368da7bdf921eb9d95bd1a9229e7f21a43ee"
@ -6218,14 +6210,6 @@ write-file-atomic@^3.0.0:
signal-exit "^3.0.2"
typedarray-to-buffer "^3.1.5"
xml2js@^0.4.19:
version "0.4.23"
resolved "https://registry.yarnpkg.com/xml2js/-/xml2js-0.4.23.tgz#a0c69516752421eb2ac758ee4d4ccf58843eac66"
integrity sha512-ySPiMjM0+pLDftHgXY4By0uswI3SPKLDw/i3UXbnO8M/p28zqexCUoPmQFrYD+/1BzhGJSs2i1ERWKJAtiLrug==
dependencies:
sax ">=0.6.0"
xmlbuilder "~11.0.0"
xml2js@^0.5.0:
version "0.5.0"
resolved "https://registry.yarnpkg.com/xml2js/-/xml2js-0.5.0.tgz#d9440631fbb2ed800203fad106f2724f62c493b7"