Remove duplicates from downloaded message list (#650)
Some downloaded feeds contain multiple entries for the same message. See `http://feeds.feedburner.com/abseilio` for an example that has: ``` <pubDate>2020-09-11T00:00:00-04:00</pubDate> <link>https://abseil.io/tips/5</link> -- <pubDate>2020-06-01T00:00:00-04:00</pubDate> <link>https://abseil.io/tips/5</link> ``` When updating the database for the first time, both messages end up stored. The following feed updates result in one entry matching the database, and the second entry having a different creation date. This makes the second entry always marked as unread.
This commit is contained in:
parent
aee9394f98
commit
7b19910e0b
|
@ -348,6 +348,8 @@ void FeedDownloader::updateOneFeed(ServiceRoot* acc,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
removeDuplicateMessages(msgs);
|
||||||
|
|
||||||
// Now make sure, that messages are actually stored to SQL in a locked state.
|
// Now make sure, that messages are actually stored to SQL in a locked state.
|
||||||
qDebugNN << LOGSEC_FEEDDOWNLOADER << "Saving messages of feed ID '"
|
qDebugNN << LOGSEC_FEEDDOWNLOADER << "Saving messages of feed ID '"
|
||||||
<< feed->customId() << "' URL: '" << feed->source() << "' title: '" << feed->title() << "' in thread: '"
|
<< feed->customId() << "' URL: '" << feed->source() << "' title: '" << feed->title() << "' in thread: '"
|
||||||
|
@ -418,6 +420,59 @@ bool FeedDownloader::isCacheSynchronizationRunning() const {
|
||||||
return m_isCacheSynchronizationRunning;
|
return m_isCacheSynchronizationRunning;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void FeedDownloader::removeDuplicateMessages(QList<Message>& messages) {
|
||||||
|
auto idx = 0;
|
||||||
|
while (idx < messages.size()) {
|
||||||
|
Message& message = messages[idx];
|
||||||
|
std::function<bool(const Message& a, const Message& b)> is_duplicate;
|
||||||
|
if (message.m_id > 0) {
|
||||||
|
is_duplicate = [](const Message& a, const Message& b) {
|
||||||
|
return a.m_id == b.m_id;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
else if (message.m_customId.isEmpty()) {
|
||||||
|
is_duplicate = [](const Message& a, const Message& b) {
|
||||||
|
return std::tie(a.m_title, a.m_url, a.m_author) == std::tie(b.m_title, b.m_url, b.m_author);
|
||||||
|
};
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
is_duplicate = [](const Message& a, const Message& b) {
|
||||||
|
return a.m_customId == b.m_customId;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
auto next_idx = idx + 1; // Index of next message to check after removing all duplicates.
|
||||||
|
auto last_idx = idx; // Index of the last kept duplicate.
|
||||||
|
idx = next_idx;
|
||||||
|
|
||||||
|
// Remove all duplicate messages, and keep the message with the latest created date.
|
||||||
|
// If the created date is identical for all duplicate messages then keep the last message in the list.
|
||||||
|
while (idx < messages.size()) {
|
||||||
|
auto& last_duplicate = messages[last_idx];
|
||||||
|
if (is_duplicate(last_duplicate, messages[idx])) {
|
||||||
|
if (last_duplicate.m_created <= messages[idx].m_created) {
|
||||||
|
// The last seen message was created earlier or at the same date -- keep the current, and remove the last.
|
||||||
|
messages.removeAt(last_idx);
|
||||||
|
if (last_idx + 1 == next_idx) {
|
||||||
|
// The `next_idx` was pointing to the message following the duplicate. With that duplicate removed the
|
||||||
|
// next index needs to be adjusted.
|
||||||
|
next_idx = last_idx;
|
||||||
|
}
|
||||||
|
last_idx = idx;
|
||||||
|
++idx;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
messages.removeAt(idx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
++idx;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
idx = next_idx;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
QString FeedDownloadResults::overview(int how_many_feeds) const {
|
QString FeedDownloadResults::overview(int how_many_feeds) const {
|
||||||
QStringList result;
|
QStringList result;
|
||||||
|
|
||||||
|
|
|
@ -59,6 +59,7 @@ class FeedDownloader : public QObject {
|
||||||
const QHash<ServiceRoot::BagOfMessages, QStringList>& stated_messages,
|
const QHash<ServiceRoot::BagOfMessages, QStringList>& stated_messages,
|
||||||
const QHash<QString, QStringList>& tagged_messages);
|
const QHash<QString, QStringList>& tagged_messages);
|
||||||
void finalizeUpdate();
|
void finalizeUpdate();
|
||||||
|
static void removeDuplicateMessages(QList<Message>& messages);
|
||||||
|
|
||||||
bool m_isCacheSynchronizationRunning;
|
bool m_isCacheSynchronizationRunning;
|
||||||
bool m_stopCacheSynchronization;
|
bool m_stopCacheSynchronization;
|
||||||
|
|
Loading…
Reference in New Issue