From 507b014bd59893bd87cdb9d5b43ccf4bf4b20e30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadeusz=20So=C5=9Bnierz?= Date: Tue, 28 Mar 2023 11:06:30 +0200 Subject: [PATCH] Make sure we're not treating garbage data in feed item as guids (#689) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Make sure we're not treating garbage data in feed item as guids Some of these would parse as `{ '$': { isPermaLink: 'false' } }` or similar, which would then surprise us very much when assuming that it's a string stored in accountData. * Changelog * Exclude empty strings when looking for viable guids Co-authored-by: Christian Paul * Hash seen feed entry guids for storing and checking This saves a little bit of space in the common case, and prevents bloating the storage were someone to feed us obnoxiously long guids. --------- Co-authored-by: Tadeusz Sośnierz Co-authored-by: Christian Paul --- changelog.d/687.bugfix | 1 + src/feeds/FeedReader.ts | 18 +++++++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) create mode 100644 changelog.d/687.bugfix diff --git a/changelog.d/687.bugfix b/changelog.d/687.bugfix new file mode 100644 index 00000000..2965850f --- /dev/null +++ b/changelog.d/687.bugfix @@ -0,0 +1 @@ +Make sure we're not treating garbage data in feed items as guids. 
diff --git a/src/feeds/FeedReader.ts b/src/feeds/FeedReader.ts index 6d9fdc84..8faa78ee 100644 --- a/src/feeds/FeedReader.ts +++ b/src/feeds/FeedReader.ts @@ -12,6 +12,7 @@ import Metrics from "../Metrics"; import UserAgent from "../UserAgent"; import { randomUUID } from "crypto"; import { StatusCodes } from "http-status-codes"; +import { FormatUtil } from "../FormatUtil"; const log = new Logger("FeedReader"); @@ -242,22 +243,29 @@ export class FeedReader { seenGuids = []; seenEntriesChanged = true; // to ensure we only treat it as an initialSync once } + + // migrate legacy, cleartext guids to their md5-hashed counterparts + seenGuids = seenGuids.map(guid => guid.startsWith('md5:') ? guid : this.hashGuid(guid)); + const seenGuidsSet = new Set(seenGuids); const newGuids = []; log.debug(`Found ${feed.items.length} entries in ${url}`); for (const item of feed.items) { - const guid = item.guid || item.id || item.link || item.title; + // Find the first guid-like that looks like a string. + // Some feeds have a nasty habit of leading a empty tag there, making us parse it as garbage. + const guid = [item.guid, item.id, item.link, item.title].find(id => typeof id === 'string' && id); if (!guid) { log.error(`Could not determine guid for entry in ${url}, skipping`); continue; } - newGuids.push(guid); + const hashedGuid = this.hashGuid(guid); + newGuids.push(hashedGuid); if (initialSync) { log.debug(`Skipping entry ${guid} since we're performing an initial sync`); continue; } - if (seenGuidsSet.has(guid)) { + if (seenGuidsSet.has(hashedGuid)) { log.debug('Skipping already seen entry', guid); continue; } @@ -334,4 +342,8 @@ export class FeedReader { void this.pollFeeds(); }, sleepFor); } + + private hashGuid(guid: string): string { + return `md5:${FormatUtil.hashId(guid)}`; + } }