Improvements related to duplicate detection (#5387)
* Move duplicate detection into a single place
* Canonicalize some common characters that are often confused with each other
* Assume it is the same episode even when the publication date is off by less than one week
* Display duplicate detection as a warning, not as an error
This commit is contained in:
parent
b9f578ed5c
commit
b36cdb0c4e
|
@ -21,6 +21,7 @@ import de.danoeh.antennapod.core.storage.DBReader;
|
|||
import de.danoeh.antennapod.core.storage.DBTasks;
|
||||
import de.danoeh.antennapod.core.storage.DownloadRequestException;
|
||||
import de.danoeh.antennapod.core.storage.DownloadRequester;
|
||||
import de.danoeh.antennapod.core.util.DownloadError;
|
||||
import de.danoeh.antennapod.model.feed.Feed;
|
||||
import de.danoeh.antennapod.model.feed.FeedMedia;
|
||||
import de.danoeh.antennapod.ui.common.ThemeUtils;
|
||||
|
@ -101,8 +102,13 @@ public class DownloadLogAdapter extends BaseAdapter {
|
|||
holder.reason.setVisibility(View.GONE);
|
||||
holder.tapForDetails.setVisibility(View.GONE);
|
||||
} else {
|
||||
holder.icon.setTextColor(ContextCompat.getColor(context, R.color.download_failed_red));
|
||||
holder.icon.setText("{fa-times-circle}");
|
||||
if (status.getReason() == DownloadError.ERROR_PARSER_EXCEPTION_DUPLICATE) {
|
||||
holder.icon.setTextColor(ContextCompat.getColor(context, R.color.download_warning_yellow));
|
||||
holder.icon.setText("{fa-exclamation-circle}");
|
||||
} else {
|
||||
holder.icon.setTextColor(ContextCompat.getColor(context, R.color.download_failed_red));
|
||||
holder.icon.setText("{fa-times-circle}");
|
||||
}
|
||||
holder.icon.setContentDescription(context.getString(R.string.error_label));
|
||||
holder.reason.setText(status.getReason().getErrorString(context));
|
||||
holder.reason.setVisibility(View.VISIBLE);
|
||||
|
|
|
@ -335,59 +335,56 @@ public final class DBTasks {
|
|||
}
|
||||
|
||||
/**
|
||||
* Get a FeedItem by its identifying value or download_url.
|
||||
* For de-duplicating items that are not stored yet, see also FeedHandler.dedupItems
|
||||
* Get a FeedItem by its identifying value.
|
||||
*/
|
||||
private static FeedItem searchFeedItemByIdentifyingValue(Context context, Feed feed, FeedItem searchItem) {
|
||||
for (FeedItem item : feed.getItems()) {
|
||||
private static FeedItem searchFeedItemByIdentifyingValue(List<FeedItem> items, FeedItem searchItem) {
|
||||
for (FeedItem item : items) {
|
||||
if (TextUtils.equals(item.getIdentifyingValue(), searchItem.getIdentifyingValue())) {
|
||||
return item;
|
||||
}
|
||||
}
|
||||
// Did not find item with same ID. Try to guess duplicates based on other metadata.
|
||||
for (FeedItem item : feed.getItems()) {
|
||||
boolean isDuplicate = false;
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Guess if one of the items could actually mean the searched item, even if it uses another identifying value.
|
||||
* This is to work around podcasters breaking their GUIDs.
|
||||
*/
|
||||
private static FeedItem searchFeedItemGuessDuplicate(List<FeedItem> items, FeedItem searchItem) {
|
||||
for (FeedItem item : items) {
|
||||
if ((item.getMedia() != null)
|
||||
&& (searchItem.getMedia() != null)
|
||||
&& !TextUtils.isEmpty(item.getMedia().getStreamUrl())
|
||||
&& !TextUtils.isEmpty(searchItem.getMedia().getStreamUrl())
|
||||
&& TextUtils.equals(item.getMedia().getStreamUrl(), searchItem.getMedia().getStreamUrl())) {
|
||||
Log.d(TAG, "Removing duplicate episode stream url " + item.getMedia().getStreamUrl());
|
||||
isDuplicate = true;
|
||||
} else if (TextUtils.equals(item.getTitle(), searchItem.getTitle())) {
|
||||
Log.d(TAG, "Found same title. Checking pubdate: " + item.getTitle());
|
||||
return item;
|
||||
} else if (titlesLookSimilar(item.getTitle(), searchItem.getTitle())) {
|
||||
long dateOriginal = item.getPubDate().getTime();
|
||||
long dateNew = searchItem.getPubDate() == null ? 0 : searchItem.getPubDate().getTime();
|
||||
if (Math.abs(dateOriginal - dateNew) < 24L * 3600L * 1000L) { // Same day
|
||||
Log.d(TAG, "Same pubDate. Removing. " + item.getPubDate() + ", " + searchItem.getPubDate());
|
||||
isDuplicate = true;
|
||||
if (Math.abs(dateOriginal - dateNew) < 7L * 24L * 3600L * 1000L) { // Same week
|
||||
return item;
|
||||
}
|
||||
}
|
||||
if (isDuplicate) {
|
||||
DBWriter.addDownloadStatus(new DownloadStatus(feed,
|
||||
searchItem.getTitle(), DownloadError.ERROR_PARSER_EXCEPTION, false,
|
||||
"The podcast host changed the ID of an existing episode instead of just "
|
||||
+ "updating the episode itself. AntennaPod attempted to repair it.\n\n"
|
||||
+ "{" + item.getTitle() + "} with ID " + item.getItemIdentifier()
|
||||
+ " seems to be the same as {" + searchItem.getTitle() + "} with ID "
|
||||
+ searchItem.getItemIdentifier(), false));
|
||||
item.setItemIdentifier(searchItem.getItemIdentifier());
|
||||
|
||||
if (item.isPlayed() && item.getMedia() != null) {
|
||||
EpisodeAction action = new EpisodeAction.Builder(item, EpisodeAction.PLAY)
|
||||
.currentTimestamp()
|
||||
.started(item.getMedia().getDuration() / 1000)
|
||||
.position(item.getMedia().getDuration() / 1000)
|
||||
.total(item.getMedia().getDuration() / 1000)
|
||||
.build();
|
||||
SyncService.enqueueEpisodeAction(context, action);
|
||||
}
|
||||
return item;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private static boolean titlesLookSimilar(String title1, String title2) {
|
||||
if (TextUtils.isEmpty(title1) || TextUtils.isEmpty(title2)) {
|
||||
return false;
|
||||
}
|
||||
return canonicalizeTitle(title1).equals(canonicalizeTitle(title2));
|
||||
}
|
||||
|
||||
private static String canonicalizeTitle(String title) {
|
||||
return title
|
||||
.trim()
|
||||
.replace('“', '"')
|
||||
.replace('”', '"')
|
||||
.replace('„', '"')
|
||||
.replace('—', '-');
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds new Feeds to the database or updates the old versions if they already exists. If another Feed with the same
|
||||
* identifying value already exists, this method will add new FeedItems from the new Feed to the existing Feed.
|
||||
|
@ -454,8 +451,45 @@ public final class DBTasks {
|
|||
// Look for new or updated Items
|
||||
for (int idx = 0; idx < newFeed.getItems().size(); idx++) {
|
||||
final FeedItem item = newFeed.getItems().get(idx);
|
||||
FeedItem oldItem = searchFeedItemByIdentifyingValue(context, savedFeed, item);
|
||||
|
||||
if (item != searchFeedItemGuessDuplicate(newFeed.getItems(), item)) {
|
||||
// Canonical episode is the first one returned (usually oldest)
|
||||
DBWriter.addDownloadStatus(new DownloadStatus(savedFeed,
|
||||
item.getTitle(), DownloadError.ERROR_PARSER_EXCEPTION_DUPLICATE, false,
|
||||
"The podcast host appears to have added the same episode twice. "
|
||||
+ "AntennaPod attempted to repair it.", false));
|
||||
continue;
|
||||
}
|
||||
|
||||
FeedItem oldItem = searchFeedItemByIdentifyingValue(savedFeed.getItems(), item);
|
||||
if (oldItem == null) {
|
||||
oldItem = searchFeedItemGuessDuplicate(savedFeed.getItems(), item);
|
||||
if (oldItem != null) {
|
||||
Log.d(TAG, "Repaired duplicate: " + oldItem + ", " + item);
|
||||
DBWriter.addDownloadStatus(new DownloadStatus(savedFeed,
|
||||
item.getTitle(), DownloadError.ERROR_PARSER_EXCEPTION_DUPLICATE, false,
|
||||
"The podcast host changed the ID of an existing episode instead of just "
|
||||
+ "updating the episode itself. AntennaPod attempted to repair it.\n\n"
|
||||
+ "{" + oldItem.getTitle() + "} with ID " + oldItem.getItemIdentifier()
|
||||
+ " seems to be the same as {" + item.getTitle() + "} with ID "
|
||||
+ item.getItemIdentifier(), false));
|
||||
oldItem.setItemIdentifier(item.getItemIdentifier());
|
||||
|
||||
if (oldItem.isPlayed() && oldItem.getMedia() != null) {
|
||||
EpisodeAction action = new EpisodeAction.Builder(oldItem, EpisodeAction.PLAY)
|
||||
.currentTimestamp()
|
||||
.started(oldItem.getMedia().getDuration() / 1000)
|
||||
.position(oldItem.getMedia().getDuration() / 1000)
|
||||
.total(oldItem.getMedia().getDuration() / 1000)
|
||||
.build();
|
||||
SyncService.enqueueEpisodeAction(context, action);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (oldItem != null) {
|
||||
oldItem.updateFromOther(item);
|
||||
} else {
|
||||
// item is new
|
||||
item.setFeed(savedFeed);
|
||||
|
||||
|
@ -477,8 +511,6 @@ public final class DBTasks {
|
|||
+ " new, prior most recent date = " + priorMostRecentDate);
|
||||
item.setNew();
|
||||
}
|
||||
} else {
|
||||
oldItem.updateFromOther(item);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -487,7 +519,7 @@ public final class DBTasks {
|
|||
Iterator<FeedItem> it = savedFeed.getItems().iterator();
|
||||
while (it.hasNext()) {
|
||||
FeedItem feedItem = it.next();
|
||||
if (searchFeedItemByIdentifyingValue(context, newFeed, feedItem) == null) {
|
||||
if (searchFeedItemByIdentifyingValue(newFeed.getItems(), feedItem) == null) {
|
||||
unlistedItems.add(feedItem);
|
||||
it.remove();
|
||||
}
|
||||
|
|
|
@ -27,7 +27,8 @@ public enum DownloadError {
|
|||
ERROR_IO_BLOCKED(18, R.string.download_error_blocked),
|
||||
ERROR_UNSUPPORTED_TYPE_HTML(19, R.string.download_error_unsupported_type_html),
|
||||
ERROR_NOT_FOUND(20, R.string.download_error_not_found),
|
||||
ERROR_CERTIFICATE(21, R.string.download_error_certificate);
|
||||
ERROR_CERTIFICATE(21, R.string.download_error_certificate),
|
||||
ERROR_PARSER_EXCEPTION_DUPLICATE(22, R.string.download_error_parser_exception);
|
||||
|
||||
private final int code;
|
||||
private final int resId;
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
<color name="medium_gray">#afafaf</color>
|
||||
<color name="black">#000000</color>
|
||||
<color name="download_success_green">#248800</color>
|
||||
<color name="download_warning_yellow">#F59F00</color>
|
||||
<color name="download_failed_red">#B00020</color>
|
||||
<color name="image_readability_tint">#80000000</color>
|
||||
<color name="feed_image_bg">#50000000</color>
|
||||
|
|
|
@ -1,8 +1,5 @@
|
|||
package de.danoeh.antennapod.parser.feed;
|
||||
|
||||
import android.text.TextUtils;
|
||||
import android.util.Log;
|
||||
|
||||
import de.danoeh.antennapod.parser.feed.util.TypeGetter;
|
||||
import org.apache.commons.io.input.XmlStreamReader;
|
||||
import org.xml.sax.InputSource;
|
||||
|
@ -11,22 +8,14 @@ import org.xml.sax.SAXException;
|
|||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import javax.xml.parsers.SAXParser;
|
||||
import javax.xml.parsers.SAXParserFactory;
|
||||
|
||||
import de.danoeh.antennapod.model.feed.Feed;
|
||||
import de.danoeh.antennapod.model.feed.FeedItem;
|
||||
|
||||
public class FeedHandler {
|
||||
private static final String TAG = "FeedHandler";
|
||||
|
||||
public FeedHandlerResult parseFeed(Feed feed) throws SAXException, IOException,
|
||||
ParserConfigurationException, UnsupportedFeedtypeException {
|
||||
TypeGetter tg = new TypeGetter();
|
||||
|
@ -42,50 +31,6 @@ public class FeedHandler {
|
|||
|
||||
saxParser.parse(inputSource, handler);
|
||||
inputStreamReader.close();
|
||||
feed.setItems(dedupItems(feed.getItems()));
|
||||
return new FeedHandlerResult(handler.state.feed, handler.state.alternateUrls);
|
||||
}
|
||||
|
||||
/**
|
||||
* For updating items that are stored in the database, see also: DBTasks.searchFeedItemByIdentifyingValue
|
||||
*/
|
||||
public static List<FeedItem> dedupItems(List<FeedItem> items) {
|
||||
if (items == null) {
|
||||
return null;
|
||||
}
|
||||
List<FeedItem> list = new ArrayList<>(items);
|
||||
Set<String> seen = new HashSet<>();
|
||||
Iterator<FeedItem> it = list.iterator();
|
||||
while (it.hasNext()) {
|
||||
FeedItem item = it.next();
|
||||
if (!TextUtils.isEmpty(item.getItemIdentifier()) && seen.contains(item.getItemIdentifier())) {
|
||||
Log.d(TAG, "Removing duplicate episode guid " + item.getItemIdentifier());
|
||||
it.remove();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (item.getMedia() == null || TextUtils.isEmpty(item.getMedia().getStreamUrl())) {
|
||||
continue;
|
||||
}
|
||||
if (seen.contains(item.getMedia().getStreamUrl())) {
|
||||
Log.d(TAG, "Removing duplicate episode stream url " + item.getMedia().getStreamUrl());
|
||||
it.remove();
|
||||
} else {
|
||||
seen.add(item.getMedia().getStreamUrl());
|
||||
if (TextUtils.isEmpty(item.getTitle()) || item.getPubDate() == null) {
|
||||
continue;
|
||||
}
|
||||
if (!seen.contains(item.getTitle() + item.getPubDate().toString())) {
|
||||
seen.add(item.getTitle() + item.getPubDate().toString());
|
||||
} else {
|
||||
Log.d(TAG, "Removing duplicate episode title and pubDate "
|
||||
+ item.getTitle()
|
||||
+ " " + item.getPubDate());
|
||||
it.remove();
|
||||
}
|
||||
}
|
||||
seen.add(item.getItemIdentifier());
|
||||
}
|
||||
return list;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue