Remove duplicates from downloaded message list (#650)
Some downloaded feeds contain multiple entries for the same message. See `http://feeds.feedburner.com/abseilio` for an example that has: ``` <pubDate>2020-09-11T00:00:00-04:00</pubDate> <link>https://abseil.io/tips/5</link> -- <pubDate>2020-06-01T00:00:00-04:00</pubDate> <link>https://abseil.io/tips/5</link> ``` When the database is updated for the first time, both entries get stored. On every subsequent feed update, one entry matches the stored message while the other differs in creation date, so that second entry is always marked as unread.
This commit is contained in:
parent
aee9394f98
commit
7b19910e0b
|
@ -348,6 +348,8 @@ void FeedDownloader::updateOneFeed(ServiceRoot* acc,
|
|||
}
|
||||
}
|
||||
|
||||
removeDuplicateMessages(msgs);
|
||||
|
||||
// Now make sure, that messages are actually stored to SQL in a locked state.
|
||||
qDebugNN << LOGSEC_FEEDDOWNLOADER << "Saving messages of feed ID '"
|
||||
<< feed->customId() << "' URL: '" << feed->source() << "' title: '" << feed->title() << "' in thread: '"
|
||||
|
@ -418,6 +420,59 @@ bool FeedDownloader::isCacheSynchronizationRunning() const {
|
|||
return m_isCacheSynchronizationRunning;
|
||||
}
|
||||
|
||||
void FeedDownloader::removeDuplicateMessages(QList<Message>& messages) {
  // Deduplicates `messages` in place. Some feeds publish several entries for
  // the same article (see #650); for each group of duplicates exactly one
  // message is kept: the one with the latest created date, or the last one in
  // the list when the dates tie. Relative order of kept messages is preserved.
  //
  // BUGFIX over the previous version: after `removeAt(last_idx)` the old code
  // set `last_idx = idx`, but the kept message had shifted to `idx - 1`, so
  // subsequent comparisons used an unrelated message as the baseline under the
  // current group's predicate. That could delete non-duplicates, e.g. two
  // messages with distinct `m_id`s but equal title/url/author scanned right
  // after a removal.
  int idx = 0;

  while (idx < messages.size()) {
    const Message& message = messages[idx];

    // Choose the identity criterion from what the current message provides:
    // DB id if already stored, feed-supplied custom id if present, otherwise
    // the title/url/author triple.
    std::function<bool(const Message& a, const Message& b)> is_duplicate;

    if (message.m_id > 0) {
      is_duplicate = [](const Message& a, const Message& b) {
        return a.m_id == b.m_id;
      };
    }
    else if (message.m_customId.isEmpty()) {
      is_duplicate = [](const Message& a, const Message& b) {
        return std::tie(a.m_title, a.m_url, a.m_author) == std::tie(b.m_title, b.m_url, b.m_author);
      };
    }
    else {
      is_duplicate = [](const Message& a, const Message& b) {
        return a.m_customId == b.m_customId;
      };
    }

    // Pick the member of this duplicate group which must survive: latest
    // created date wins, ties are broken in favor of the later list position.
    int keep_idx = idx;

    for (int j = idx + 1; j < messages.size(); ++j) {
      if (is_duplicate(messages[idx], messages[j]) && messages[keep_idx].m_created <= messages[j].m_created) {
        keep_idx = j;
      }
    }

    // Drop every other member of the group. A by-value copy of the keeper is
    // used for the comparisons because removals shift positions; iterating
    // backwards keeps indices of not-yet-visited elements valid and ensures
    // `j` can never land on the keeper's (shifted) position.
    const Message keeper = messages[keep_idx];
    const bool keeper_was_first = keep_idx == idx;

    for (int j = messages.size() - 1; j >= idx; --j) {
      if (j != keep_idx && is_duplicate(keeper, messages[j])) {
        messages.removeAt(j);
      }
    }

    // If the message at `idx` survived, this group is settled; move on.
    // Otherwise the slot now holds a not-yet-processed message (and the
    // keeper, further down the list, will later be re-examined under its own
    // identity criterion), so process position `idx` again.
    if (keeper_was_first) {
      ++idx;
    }
  }
}
|
||||
|
||||
QString FeedDownloadResults::overview(int how_many_feeds) const {
|
||||
QStringList result;
|
||||
|
||||
|
|
|
@ -59,6 +59,7 @@ class FeedDownloader : public QObject {
|
|||
const QHash<ServiceRoot::BagOfMessages, QStringList>& stated_messages,
|
||||
const QHash<QString, QStringList>& tagged_messages);
|
||||
void finalizeUpdate();
|
||||
static void removeDuplicateMessages(QList<Message>& messages);
|
||||
|
||||
bool m_isCacheSynchronizationRunning;
|
||||
bool m_stopCacheSynchronization;
|
||||
|
|
Loading…
Reference in New Issue