Don't process feed/item titles that aren't actually strings (#708)

* Don't process feed/item titles that aren't actually strings

Another case of empty-tag with attributes, these would occasionally parse
into non-string objects and make us crash when trying to stripHtml on them.

* Changelog

* Add tests for not-quite-empty <title> tags in feeds

---------

Co-authored-by: Tadeusz Sośnierz <tadeusz@sosnierz.com>
This commit is contained in:
Tadeusz Sośnierz 2023-04-12 17:52:10 +02:00 committed by GitHub
parent a634cdeed8
commit bc57dbbc83
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 109 additions and 7 deletions

1
changelog.d/708.bugfix Normal file
View File

@ -0,0 +1 @@
Fix mishandling of empty feed/item title tags.

View File

@ -68,6 +68,11 @@ interface AccountData {
[url: string]: string[],
}
interface AccountDataStore {
getAccountData<T>(type: string): Promise<T>;
setAccountData<T>(type: string, data: T): Promise<void>;
}
const accountDataSchema = {
type: 'object',
patternProperties: {
@ -81,6 +86,10 @@ const accountDataSchema = {
const ajv = new Ajv();
const validateAccountData = ajv.compile<AccountData>(accountDataSchema);
function isNonEmptyString(input: any): input is string {
return input && typeof input === 'string';
}
function stripHtml(input: string): string {
return input.replace(/<[^>]*?>/g, '');
}
@ -123,8 +132,9 @@ export class FeedReader {
headers: Record<string, string>,
timeoutMs: number,
parser: Parser = FeedReader.buildParser(),
httpClient = axios,
): Promise<{ response: AxiosResponse, feed: Parser.Output<FeedItem> }> {
const response = await axios.get(url, {
const response = await httpClient.get(url, {
headers: {
'User-Agent': UserAgent,
...headers,
@ -188,7 +198,8 @@ export class FeedReader {
private readonly config: BridgeConfigFeeds,
private readonly connectionManager: ConnectionManager,
private readonly queue: MessageQueue,
private readonly matrixClient: MatrixClient,
private readonly accountDataStore: AccountDataStore,
private readonly httpClient = axios,
) {
this.connections = this.connectionManager.getAllConnectionsOfType(FeedConnection);
this.calculateFeedUrls();
@ -237,7 +248,7 @@ export class FeedReader {
private async loadSeenEntries(): Promise<void> {
try {
const accountData = await this.matrixClient.getAccountData<AccountData>(FeedReader.seenEntriesEventType).catch((err: MatrixError|unknown) => {
const accountData = await this.accountDataStore.getAccountData<AccountData>(FeedReader.seenEntriesEventType).catch((err: MatrixError|unknown) => {
if (err instanceof MatrixError && err.statusCode === 404) {
return {} as AccountData;
} else {
@ -262,7 +273,7 @@ export class FeedReader {
for (const [url, guids] of this.seenEntries.entries()) {
accountData[url.toString()] = guids;
}
await this.matrixClient.setAccountData(FeedReader.seenEntriesEventType, accountData);
await this.accountDataStore.setAccountData(FeedReader.seenEntriesEventType, accountData);
}
/**
@ -288,6 +299,7 @@ export class FeedReader {
// We don't want to wait forever for the feed.
this.config.pollTimeoutSeconds * 1000,
this.parser,
this.httpClient,
);
// Store any entity tags/cache times.
@ -315,7 +327,7 @@ export class FeedReader {
for (const item of feed.items) {
// Find the first guid-like that looks like a string.
// Some feeds have a nasty habit of leading a empty tag there, making us parse it as garbage.
const guid = [item.guid, item.id, item.link, item.title].find(id => typeof id === 'string' && id);
const guid = [item.guid, item.id, item.link, item.title].find(isNonEmptyString);
if (!guid) {
log.error(`Could not determine guid for entry in ${url}, skipping`);
continue;
@ -334,10 +346,10 @@ export class FeedReader {
const entry = {
feed: {
title: feed.title ? stripHtml(feed.title) : null,
title: isNonEmptyString(feed.title) ? stripHtml(feed.title) : null,
url: url,
},
title: item.title ? stripHtml(item.title) : null,
title: isNonEmptyString(item.title) ? stripHtml(item.title) : null,
pubdate: item.pubDate ?? null,
summary: item.summary ?? null,
author: item.creator ?? null,

89
tests/FeedReader.spec.ts Normal file
View File

@ -0,0 +1,89 @@
import { AxiosResponse, AxiosStatic } from "axios";
import { expect } from "chai";
import EventEmitter from "events";
import { BridgeConfigFeeds } from "../src/Config/Config";
import { ConnectionManager } from "../src/ConnectionManager";
import { IConnection } from "../src/Connections";
import { FeedReader } from "../src/feeds/FeedReader";
import { MessageQueue, MessageQueueMessage } from "../src/MessageQueue";
class MockConnectionManager extends EventEmitter {
constructor(
public connections: IConnection[]
) {
super();
}
getAllConnectionsOfType(type: unknown) {
return this.connections;
}
}
class MockMessageQueue extends EventEmitter implements MessageQueue {
subscribe(eventGlob: string): void {
this.emit('subscribed', eventGlob);
}
unsubscribe(eventGlob: string): void {
this.emit('unsubscribed', eventGlob);
}
async push(data: MessageQueueMessage<unknown>, single?: boolean): Promise<void> {
this.emit('pushed', data, single);
}
async pushWait<T, X>(data: MessageQueueMessage<T>, timeout?: number, single?: boolean): Promise<X> {
throw new Error('Not yet implemented');
}
}
class MockHttpClient {
constructor(public response: AxiosResponse) {}
get(): Promise<AxiosResponse> {
return Promise.resolve(this.response);
}
}
describe("FeedReader", () => {
it("should correctly handle empty titles", async () => {
const config = new BridgeConfigFeeds({
enabled: true,
pollIntervalSeconds: 1,
pollTimeoutSeconds: 1,
});
const cm = new MockConnectionManager([{ feedUrl: 'http://test/' } as unknown as IConnection]) as unknown as ConnectionManager
const mq = new MockMessageQueue();
const feedContents = `
<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
<channel><title type='text'></title><description>test feed</description><link>http://test/</link>
<pubDate>Wed, 12 Apr 2023 09:53:00 GMT</pubDate>
<item>
<title type='text'></title><description>test item</description>
<link>http://example.com/test/1681293180</link>
<guid isPermaLink="true">http://example.com/test/1681293180</guid>
<pubDate>Wed, 12 Apr 2023 09:53:00 GMT</pubDate>
</item>
</channel></rss>
`;
const feedReader = new FeedReader(
config, cm, mq,
{
getAccountData: <T>() => Promise.resolve({ 'http://test/': [] } as unknown as T),
setAccountData: <T>() => Promise.resolve(),
},
new MockHttpClient({ headers: {}, data: feedContents } as AxiosResponse) as unknown as AxiosStatic,
);
const event: any = await new Promise((resolve) => {
mq.on('pushed', (data, _) => { resolve(data); feedReader.stop() });
});
expect(event.eventName).to.equal('feed.entry');
expect(event.data.feed.title).to.equal(null);
expect(event.data.title).to.equal(null);
});
});