Remove duplicates from downloaded message list (#650)

Some downloaded feeds contain multiple entries for the same message.
See `http://feeds.feedburner.com/abseilio` for an example that has:

```
          <pubDate>2020-09-11T00:00:00-04:00</pubDate>
          <link>https://abseil.io/tips/5</link>
--
          <pubDate>2020-06-01T00:00:00-04:00</pubDate>
          <link>https://abseil.io/tips/5</link>
```

When the database is updated for the first time, both entries end up
stored. On every subsequent feed update, one entry matches the record
already in the database while the other carries a different creation
date, so that second entry is perpetually re-marked as unread.
This commit is contained in:
igrekster 2022-02-22 18:55:30 +11:00 committed by GitHub
parent aee9394f98
commit 7b19910e0b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 56 additions and 0 deletions

View File

@ -348,6 +348,8 @@ void FeedDownloader::updateOneFeed(ServiceRoot* acc,
}
}
removeDuplicateMessages(msgs);
// Now make sure, that messages are actually stored to SQL in a locked state.
qDebugNN << LOGSEC_FEEDDOWNLOADER << "Saving messages of feed ID '"
<< feed->customId() << "' URL: '" << feed->source() << "' title: '" << feed->title() << "' in thread: '"
@ -418,6 +420,59 @@ bool FeedDownloader::isCacheSynchronizationRunning() const {
return m_isCacheSynchronizationRunning;
}
void FeedDownloader::removeDuplicateMessages(QList<Message>& messages) {
  // Collapses duplicate entries in a freshly downloaded message list so that
  // only one copy of each message is stored in the database (issue #650).
  // Among duplicates, the message with the latest `m_created` date is kept;
  // when the dates are identical, the message occurring later in the list wins.
  auto idx = 0;

  while (idx < messages.size()) {
    Message& message = messages[idx];

    // Pick the strongest identity criterion available for the anchor message:
    // DB ID if assigned, otherwise feed-supplied custom ID, otherwise a
    // field-wise comparison of title/URL/author.
    std::function<bool(const Message& a, const Message& b)> is_duplicate;

    if (message.m_id > 0) {
      is_duplicate = [](const Message& a, const Message& b) {
        return a.m_id == b.m_id;
      };
    }
    else if (message.m_customId.isEmpty()) {
      is_duplicate = [](const Message& a, const Message& b) {
        return std::tie(a.m_title, a.m_url, a.m_author) == std::tie(b.m_title, b.m_url, b.m_author);
      };
    }
    else {
      is_duplicate = [](const Message& a, const Message& b) {
        return a.m_customId == b.m_customId;
      };
    }

    auto next_idx = idx + 1; // Index of next message to check after removing all duplicates.
    auto last_idx = idx;     // Index of the last kept duplicate.
    idx = next_idx;

    // Remove all duplicate messages, and keep the message with the latest created date.
    // If the created date is identical for all duplicate messages then keep the last message in the list.
    while (idx < messages.size()) {
      auto& last_duplicate = messages[last_idx];

      if (is_duplicate(last_duplicate, messages[idx])) {
        if (last_duplicate.m_created <= messages[idx].m_created) {
          // The last seen message was created earlier or at the same date -- keep the current, and remove the last.
          messages.removeAt(last_idx);

          if (last_idx + 1 == next_idx) {
            // The `next_idx` was pointing to the message following the duplicate. With that duplicate removed the
            // next index needs to be adjusted.
            next_idx = last_idx;
          }

          // The removal shifted every element from `last_idx` onwards down by one,
          // so the message that won this comparison now lives at `idx - 1`, and
          // `idx` already addresses the next unexamined message -- do NOT advance it.
          // (Setting `last_idx = idx` here would make `last_idx` point at an
          // unrelated message, so later predicate matches could compare -- and
          // delete -- messages that are not duplicates of the anchor at all.)
          last_idx = idx - 1;
        }
        else {
          // The kept duplicate is strictly newer -- drop the current message instead.
          messages.removeAt(idx);
        }
      }
      else {
        ++idx;
      }
    }

    idx = next_idx;
  }
}
QString FeedDownloadResults::overview(int how_many_feeds) const {
QStringList result;

View File

@ -59,6 +59,7 @@ class FeedDownloader : public QObject {
const QHash<ServiceRoot::BagOfMessages, QStringList>& stated_messages,
const QHash<QString, QStringList>& tagged_messages);
void finalizeUpdate();
static void removeDuplicateMessages(QList<Message>& messages);
bool m_isCacheSynchronizationRunning;
bool m_stopCacheSynchronization;