Improvements related to duplicate detection (#5387)

* Move duplicate detection to one single place
* Canonicalize some common characters that are often confused
* Assume same episode even when the date is off by up to (but less than) 1 week
* Display duplicate detection as warning, not error
This commit is contained in:
ByteHamster 2021-09-06 17:59:17 +02:00 committed by GitHub
parent b9f578ed5c
commit b36cdb0c4e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 82 additions and 97 deletions

View File

@ -21,6 +21,7 @@ import de.danoeh.antennapod.core.storage.DBReader;
import de.danoeh.antennapod.core.storage.DBTasks;
import de.danoeh.antennapod.core.storage.DownloadRequestException;
import de.danoeh.antennapod.core.storage.DownloadRequester;
import de.danoeh.antennapod.core.util.DownloadError;
import de.danoeh.antennapod.model.feed.Feed;
import de.danoeh.antennapod.model.feed.FeedMedia;
import de.danoeh.antennapod.ui.common.ThemeUtils;
@ -101,8 +102,13 @@ public class DownloadLogAdapter extends BaseAdapter {
holder.reason.setVisibility(View.GONE);
holder.tapForDetails.setVisibility(View.GONE);
} else {
holder.icon.setTextColor(ContextCompat.getColor(context, R.color.download_failed_red));
holder.icon.setText("{fa-times-circle}");
if (status.getReason() == DownloadError.ERROR_PARSER_EXCEPTION_DUPLICATE) {
holder.icon.setTextColor(ContextCompat.getColor(context, R.color.download_warning_yellow));
holder.icon.setText("{fa-exclamation-circle}");
} else {
holder.icon.setTextColor(ContextCompat.getColor(context, R.color.download_failed_red));
holder.icon.setText("{fa-times-circle}");
}
holder.icon.setContentDescription(context.getString(R.string.error_label));
holder.reason.setText(status.getReason().getErrorString(context));
holder.reason.setVisibility(View.VISIBLE);

View File

@ -335,59 +335,56 @@ public final class DBTasks {
}
/**
* Get a FeedItem by its identifying value or download_url.
* For de-duplicating items that are not stored yet, see also FeedHandler.dedupItems
* Get a FeedItem by its identifying value.
*/
private static FeedItem searchFeedItemByIdentifyingValue(Context context, Feed feed, FeedItem searchItem) {
for (FeedItem item : feed.getItems()) {
private static FeedItem searchFeedItemByIdentifyingValue(List<FeedItem> items, FeedItem searchItem) {
for (FeedItem item : items) {
if (TextUtils.equals(item.getIdentifyingValue(), searchItem.getIdentifyingValue())) {
return item;
}
}
// Did not find item with same ID. Try to guess duplicates based on other metadata.
for (FeedItem item : feed.getItems()) {
boolean isDuplicate = false;
return null;
}
/**
* Guess if one of the items could actually mean the searched item, even if it uses another identifying value.
* This is to work around podcasters breaking their GUIDs.
*/
private static FeedItem searchFeedItemGuessDuplicate(List<FeedItem> items, FeedItem searchItem) {
for (FeedItem item : items) {
if ((item.getMedia() != null)
&& (searchItem.getMedia() != null)
&& !TextUtils.isEmpty(item.getMedia().getStreamUrl())
&& !TextUtils.isEmpty(searchItem.getMedia().getStreamUrl())
&& TextUtils.equals(item.getMedia().getStreamUrl(), searchItem.getMedia().getStreamUrl())) {
Log.d(TAG, "Removing duplicate episode stream url " + item.getMedia().getStreamUrl());
isDuplicate = true;
} else if (TextUtils.equals(item.getTitle(), searchItem.getTitle())) {
Log.d(TAG, "Found same title. Checking pubdate: " + item.getTitle());
return item;
} else if (titlesLookSimilar(item.getTitle(), searchItem.getTitle())) {
long dateOriginal = item.getPubDate().getTime();
long dateNew = searchItem.getPubDate() == null ? 0 : searchItem.getPubDate().getTime();
if (Math.abs(dateOriginal - dateNew) < 24L * 3600L * 1000L) { // Same day
Log.d(TAG, "Same pubDate. Removing. " + item.getPubDate() + ", " + searchItem.getPubDate());
isDuplicate = true;
if (Math.abs(dateOriginal - dateNew) < 7L * 24L * 3600L * 1000L) { // Same week
return item;
}
}
if (isDuplicate) {
DBWriter.addDownloadStatus(new DownloadStatus(feed,
searchItem.getTitle(), DownloadError.ERROR_PARSER_EXCEPTION, false,
"The podcast host changed the ID of an existing episode instead of just "
+ "updating the episode itself. AntennaPod attempted to repair it.\n\n"
+ "{" + item.getTitle() + "} with ID " + item.getItemIdentifier()
+ " seems to be the same as {" + searchItem.getTitle() + "} with ID "
+ searchItem.getItemIdentifier(), false));
item.setItemIdentifier(searchItem.getItemIdentifier());
if (item.isPlayed() && item.getMedia() != null) {
EpisodeAction action = new EpisodeAction.Builder(item, EpisodeAction.PLAY)
.currentTimestamp()
.started(item.getMedia().getDuration() / 1000)
.position(item.getMedia().getDuration() / 1000)
.total(item.getMedia().getDuration() / 1000)
.build();
SyncService.enqueueEpisodeAction(context, action);
}
return item;
}
}
return null;
}
/**
 * Decides whether two episode titles should be treated as the same title.
 * Both titles must be non-empty; they are then compared after being passed
 * through {@code canonicalizeTitle}.
 *
 * @param title1 first title, may be null or empty
 * @param title2 second title, may be null or empty
 * @return true only if both titles are non-empty and their canonical forms are equal
 */
private static boolean titlesLookSimilar(String title1, String title2) {
    // An absent title never matches anything, not even another absent title.
    boolean bothPresent = !TextUtils.isEmpty(title1) && !TextUtils.isEmpty(title2);
    return bothPresent && canonicalizeTitle(title1).equals(canonicalizeTitle(title2));
}
/**
 * Brings a title into a canonical form for comparison: surrounding whitespace
 * is trimmed, the typographic double quotes are replaced by the plain
 * {@code "} character, and the em dash is replaced by a plain hyphen.
 *
 * @param title the title to canonicalize, must not be null
 * @return the canonicalized title
 */
private static String canonicalizeTitle(String title) {
    String normalized = title.trim();
    // Fold every typographic double-quote variant into the plain quote character.
    for (char fancyQuote : new char[] {'“', '”', '„'}) {
        normalized = normalized.replace(fancyQuote, '"');
    }
    // The em dash is commonly swapped with a plain hyphen.
    return normalized.replace('—', '-');
}
/**
* Adds new Feeds to the database or updates the old versions if they already exists. If another Feed with the same
* identifying value already exists, this method will add new FeedItems from the new Feed to the existing Feed.
@ -454,8 +451,45 @@ public final class DBTasks {
// Look for new or updated Items
for (int idx = 0; idx < newFeed.getItems().size(); idx++) {
final FeedItem item = newFeed.getItems().get(idx);
FeedItem oldItem = searchFeedItemByIdentifyingValue(context, savedFeed, item);
if (item != searchFeedItemGuessDuplicate(newFeed.getItems(), item)) {
// Canonical episode is the first one returned (usually oldest)
DBWriter.addDownloadStatus(new DownloadStatus(savedFeed,
item.getTitle(), DownloadError.ERROR_PARSER_EXCEPTION_DUPLICATE, false,
"The podcast host appears to have added the same episode twice. "
+ "AntennaPod attempted to repair it.", false));
continue;
}
FeedItem oldItem = searchFeedItemByIdentifyingValue(savedFeed.getItems(), item);
if (oldItem == null) {
oldItem = searchFeedItemGuessDuplicate(savedFeed.getItems(), item);
if (oldItem != null) {
Log.d(TAG, "Repaired duplicate: " + oldItem + ", " + item);
DBWriter.addDownloadStatus(new DownloadStatus(savedFeed,
item.getTitle(), DownloadError.ERROR_PARSER_EXCEPTION_DUPLICATE, false,
"The podcast host changed the ID of an existing episode instead of just "
+ "updating the episode itself. AntennaPod attempted to repair it.\n\n"
+ "{" + oldItem.getTitle() + "} with ID " + oldItem.getItemIdentifier()
+ " seems to be the same as {" + item.getTitle() + "} with ID "
+ item.getItemIdentifier(), false));
oldItem.setItemIdentifier(item.getItemIdentifier());
if (oldItem.isPlayed() && oldItem.getMedia() != null) {
EpisodeAction action = new EpisodeAction.Builder(oldItem, EpisodeAction.PLAY)
.currentTimestamp()
.started(oldItem.getMedia().getDuration() / 1000)
.position(oldItem.getMedia().getDuration() / 1000)
.total(oldItem.getMedia().getDuration() / 1000)
.build();
SyncService.enqueueEpisodeAction(context, action);
}
}
}
if (oldItem != null) {
oldItem.updateFromOther(item);
} else {
// item is new
item.setFeed(savedFeed);
@ -477,8 +511,6 @@ public final class DBTasks {
+ " new, prior most recent date = " + priorMostRecentDate);
item.setNew();
}
} else {
oldItem.updateFromOther(item);
}
}
@ -487,7 +519,7 @@ public final class DBTasks {
Iterator<FeedItem> it = savedFeed.getItems().iterator();
while (it.hasNext()) {
FeedItem feedItem = it.next();
if (searchFeedItemByIdentifyingValue(context, newFeed, feedItem) == null) {
if (searchFeedItemByIdentifyingValue(newFeed.getItems(), feedItem) == null) {
unlistedItems.add(feedItem);
it.remove();
}

View File

@ -27,7 +27,8 @@ public enum DownloadError {
ERROR_IO_BLOCKED(18, R.string.download_error_blocked),
ERROR_UNSUPPORTED_TYPE_HTML(19, R.string.download_error_unsupported_type_html),
ERROR_NOT_FOUND(20, R.string.download_error_not_found),
ERROR_CERTIFICATE(21, R.string.download_error_certificate);
ERROR_CERTIFICATE(21, R.string.download_error_certificate),
ERROR_PARSER_EXCEPTION_DUPLICATE(22, R.string.download_error_parser_exception);
private final int code;
private final int resId;

View File

@ -8,6 +8,7 @@
<color name="medium_gray">#afafaf</color>
<color name="black">#000000</color>
<color name="download_success_green">#248800</color>
<color name="download_warning_yellow">#F59F00</color>
<color name="download_failed_red">#B00020</color>
<color name="image_readability_tint">#80000000</color>
<color name="feed_image_bg">#50000000</color>

View File

@ -1,8 +1,5 @@
package de.danoeh.antennapod.parser.feed;
import android.text.TextUtils;
import android.util.Log;
import de.danoeh.antennapod.parser.feed.util.TypeGetter;
import org.apache.commons.io.input.XmlStreamReader;
import org.xml.sax.InputSource;
@ -11,22 +8,14 @@ import org.xml.sax.SAXException;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import de.danoeh.antennapod.model.feed.Feed;
import de.danoeh.antennapod.model.feed.FeedItem;
public class FeedHandler {
private static final String TAG = "FeedHandler";
public FeedHandlerResult parseFeed(Feed feed) throws SAXException, IOException,
ParserConfigurationException, UnsupportedFeedtypeException {
TypeGetter tg = new TypeGetter();
@ -42,50 +31,6 @@ public class FeedHandler {
saxParser.parse(inputSource, handler);
inputStreamReader.close();
feed.setItems(dedupItems(feed.getItems()));
return new FeedHandlerResult(handler.state.feed, handler.state.alternateUrls);
}
/**
* Returns a copy of the given item list with apparent duplicates removed.
* The input list itself is not modified; iteration and removal happen on a new ArrayList.
* Items are visited in list order, so the first occurrence is kept and later matches are dropped.
* An item is dropped when its non-empty item identifier, its stream URL, or its
* title + pubDate string has already been recorded by an earlier item.
*
* For updating items that are stored in the database, see also: DBTasks.searchFeedItemByIdentifyingValue
*
* @param items the parsed items, may be null
* @return a new de-duplicated list, or null if items is null
*/
public static List<FeedItem> dedupItems(List<FeedItem> items) {
if (items == null) {
return null;
}
List<FeedItem> list = new ArrayList<>(items);
// A single set holds all three kinds of keys: item identifiers, stream URLs and title+pubDate strings.
Set<String> seen = new HashSet<>();
Iterator<FeedItem> it = list.iterator();
while (it.hasNext()) {
FeedItem item = it.next();
// Check 1: a non-empty item identifier that was already recorded.
if (!TextUtils.isEmpty(item.getItemIdentifier()) && seen.contains(item.getItemIdentifier())) {
Log.d(TAG, "Removing duplicate episode guid " + item.getItemIdentifier());
it.remove();
continue;
}
// Items without media or without a stream URL skip all remaining checks.
// NOTE(review): this continue also skips the seen.add(identifier) at the end of the loop,
// so the identifier of such an item is never recorded - confirm whether that is intended.
if (item.getMedia() == null || TextUtils.isEmpty(item.getMedia().getStreamUrl())) {
continue;
}
// Check 2: a stream URL that was already recorded.
if (seen.contains(item.getMedia().getStreamUrl())) {
Log.d(TAG, "Removing duplicate episode stream url " + item.getMedia().getStreamUrl());
it.remove();
} else {
seen.add(item.getMedia().getStreamUrl());
// Without both title and pubDate the third check cannot be performed.
// NOTE(review): as above, this continue bypasses recording the item identifier.
if (TextUtils.isEmpty(item.getTitle()) || item.getPubDate() == null) {
continue;
}
// Check 3: the concatenation of title and pubDate.toString() that was already recorded.
if (!seen.contains(item.getTitle() + item.getPubDate().toString())) {
seen.add(item.getTitle() + item.getPubDate().toString());
} else {
Log.d(TAG, "Removing duplicate episode title and pubDate "
+ item.getTitle()
+ " " + item.getPubDate());
it.remove();
}
}
// Reached for items that passed through the stream URL branch without hitting a continue,
// including items removed there. The identifier is added without an emptiness check.
seen.add(item.getItemIdentifier());
}
return list;
}
}