Dedup based on item unique id, media url or title (#4839)

This commit is contained in:
Tony Tam 2021-07-11 00:58:54 -07:00 committed by GitHub
parent b4558efe4a
commit 81ea42a2a4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 134 additions and 23 deletions

View File

@ -334,11 +334,36 @@ public final class DBTasks {
}
/**
* Get a FeedItem by its identifying value.
* Get a FeedItem by its identifying value or download_url.
* For de-duplicating items that are not stored yet, see also FeedHandler.dedupItems
*/
private static FeedItem searchFeedItemByIdentifyingValue(Feed feed, String identifier) {
private static FeedItem searchFeedItemByIdentifyingValue(Feed feed, FeedItem searchItem) {
for (FeedItem item : feed.getItems()) {
if (TextUtils.equals(item.getIdentifyingValue(), identifier)) {
if (TextUtils.equals(item.getIdentifyingValue(), searchItem.getIdentifyingValue())) {
return item;
}
}
// Did not find item with same ID. Try to guess duplicates based on other metadata.
for (FeedItem item : feed.getItems()) {
if (item.getMedia() == null || TextUtils.isEmpty(item.getMedia().getStreamUrl())) {
continue;
}
boolean isDuplicate = false;
if (TextUtils.equals(item.getMedia().getStreamUrl(), searchItem.getMedia().getStreamUrl())) {
Log.d(TAG, "Removing duplicate episode stream url " + item.getMedia().getStreamUrl());
isDuplicate = true;
} else if (TextUtils.equals(item.getTitle(), searchItem.getTitle())
&& item.getPubDate().equals(searchItem.getPubDate())) {
Log.d(TAG, "Removing duplicate episode title + pubDate " + item.getTitle() + " " + item.getPubDate());
isDuplicate = true;
}
if (isDuplicate) {
DBWriter.addDownloadStatus(new DownloadStatus(feed,
searchItem.getTitle(), DownloadError.ERROR_PARSER_EXCEPTION, false,
"The podcast host changed the ID of an existing episode instead of just "
+ "updating the episode itself. AntennaPod attempted to repair it.", false));
item.setItemIdentifier(searchItem.getItemIdentifier());
return item;
}
}
@ -411,7 +436,7 @@ public final class DBTasks {
// Look for new or updated Items
for (int idx = 0; idx < newFeed.getItems().size(); idx++) {
final FeedItem item = newFeed.getItems().get(idx);
FeedItem oldItem = searchFeedItemByIdentifyingValue(savedFeed, item.getIdentifyingValue());
FeedItem oldItem = searchFeedItemByIdentifyingValue(savedFeed, item);
if (oldItem == null) {
// item is new
item.setFeed(savedFeed);
@ -445,7 +470,7 @@ public final class DBTasks {
Iterator<FeedItem> it = savedFeed.getItems().iterator();
while (it.hasNext()) {
FeedItem feedItem = it.next();
if (searchFeedItemByIdentifyingValue(newFeed, feedItem.getIdentifyingValue()) == null) {
if (searchFeedItemByIdentifyingValue(newFeed, feedItem) == null) {
unlistedItems.add(feedItem);
it.remove();
}

View File

@ -1,5 +1,8 @@
package de.danoeh.antennapod.core.syndication.handler;
import android.text.TextUtils;
import android.util.Log;
import org.apache.commons.io.input.XmlStreamReader;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
@ -7,30 +10,81 @@ import org.xml.sax.SAXException;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import de.danoeh.antennapod.model.feed.Feed;
import de.danoeh.antennapod.model.feed.FeedItem;
public class FeedHandler {
private static final String TAG = "FeedHandler";
public FeedHandlerResult parseFeed(Feed feed) throws SAXException, IOException,
ParserConfigurationException, UnsupportedFeedtypeException {
TypeGetter tg = new TypeGetter();
TypeGetter.Type type = tg.getType(feed);
SyndHandler handler = new SyndHandler(feed, type);
public FeedHandlerResult parseFeed(Feed feed) throws SAXException, IOException,
ParserConfigurationException, UnsupportedFeedtypeException {
TypeGetter tg = new TypeGetter();
TypeGetter.Type type = tg.getType(feed);
SyndHandler handler = new SyndHandler(feed, type);
SAXParserFactory factory = SAXParserFactory.newInstance();
factory.setNamespaceAware(true);
SAXParser saxParser = factory.newSAXParser();
File file = new File(feed.getFile_url());
Reader inputStreamReader = new XmlStreamReader(file);
InputSource inputSource = new InputSource(inputStreamReader);
SAXParserFactory factory = SAXParserFactory.newInstance();
factory.setNamespaceAware(true);
SAXParser saxParser = factory.newSAXParser();
File file = new File(feed.getFile_url());
Reader inputStreamReader = new XmlStreamReader(file);
InputSource inputSource = new InputSource(inputStreamReader);
saxParser.parse(inputSource, handler);
inputStreamReader.close();
return new FeedHandlerResult(handler.state.feed, handler.state.alternateUrls);
}
saxParser.parse(inputSource, handler);
inputStreamReader.close();
feed.setItems(dedupItems(feed.getItems()));
return new FeedHandlerResult(handler.state.feed, handler.state.alternateUrls);
}
/**
 * Removes duplicates from a list of freshly parsed feed items, keeping the first
 * occurrence. An item is considered a duplicate of an earlier one if it has the
 * same guid, the same media stream URL, or the same title + pubDate combination.
 * For updating items that are stored in the database, see also: DBTasks.searchFeedItemByIdentifyingValue
 *
 * @param items freshly parsed items; may be null
 * @return a new de-duplicated list, or null if {@code items} was null
 */
public static List<FeedItem> dedupItems(List<FeedItem> items) {
if (items == null) {
return null;
}
List<FeedItem> list = new ArrayList<>(items);
Set<String> seen = new HashSet<>();
Iterator<FeedItem> it = list.iterator();
while (it.hasNext()) {
FeedItem item = it.next();
if (seen.contains(item.getItemIdentifier())) {
Log.d(TAG, "Removing duplicate episode guid " + item.getItemIdentifier());
it.remove();
continue;
}
// Register the guid unconditionally, so that a later item with the same guid is
// detected even when this item has no media (previously such guids were skipped).
seen.add(item.getItemIdentifier());
if (item.getMedia() == null || TextUtils.isEmpty(item.getMedia().getStreamUrl())) {
continue;
}
if (seen.contains(item.getMedia().getStreamUrl())) {
Log.d(TAG, "Removing duplicate episode stream url " + item.getMedia().getStreamUrl());
it.remove();
} else {
seen.add(item.getMedia().getStreamUrl());
// Guard against items without a title or pubDate; calling toString() on a
// null pubDate would throw a NullPointerException here.
if (TextUtils.isEmpty(item.getTitle()) || item.getPubDate() == null) {
continue;
}
String titleAndDate = item.getTitle() + item.getPubDate().toString();
if (!seen.contains(titleAndDate)) {
seen.add(titleAndDate);
} else {
Log.d(TAG, "Removing duplicate episode title and pubDate "
+ item.getTitle()
+ " " + item.getPubDate());
it.remove();
}
}
}
return list;
}
}

View File

@ -134,7 +134,7 @@ public class DbCleanupTests {
if (itemState == FeedItem.PLAYED) {
playbackCompletionDate = itemDate;
}
FeedItem item = new FeedItem(0, "title", "id", "link", itemDate, itemState, feed);
FeedItem item = new FeedItem(0, "title", "id" + i, "link", itemDate, itemState, feed);
File f = new File(destFolder, "file " + i);
assertTrue(f.createNewFile());

View File

@ -92,7 +92,7 @@ public class DbNullCleanupAlgorithmTest {
feed.setItems(items);
List<File> files = new ArrayList<>();
for (int i = 0; i < numItems; i++) {
FeedItem item = new FeedItem(0, "title", "id", "link", new Date(), FeedItem.PLAYED, feed);
FeedItem item = new FeedItem(0, "title", "id" + i, "link", new Date(), FeedItem.PLAYED, feed);
File f = new File(destFolder, "file " + i);
assertTrue(f.createNewFile());

View File

@ -197,6 +197,38 @@ public class DbTasksTest {
assertEquals(8, feedFromDB.getItems().size()); // 10 - 2 = 8 items
}
/**
 * When a feed host changes an episode's guid but keeps the same media stream URL,
 * updateFeed() should treat the new item as the same episode: no extra item is
 * created, and the stored item adopts the new guid and title.
 */
@Test
public void testUpdateFeedSetDuplicate() {
final Feed feed = new Feed("url", null, "title");
feed.setItems(new ArrayList<>());
for (int i = 0; i < 10; i++) {
FeedItem item =
new FeedItem(0, "item " + i, "id " + i, "link " + i, new Date(i), FeedItem.PLAYED, feed);
FeedMedia media = new FeedMedia(item, "download url " + i, 123, "media/mp3");
item.setMedia(media);
feed.getItems().add(item);
}
PodDBAdapter adapter = PodDBAdapter.getInstance();
adapter.open();
adapter.setCompleteFeed(feed);
adapter.close();
// change the guid of the first item, but leave the download url the same
FeedItem item = feed.getItemAtIndex(0);
item.setItemIdentifier("id 0-duplicate");
item.setTitle("item 0 duplicate");
Feed newFeed = DBTasks.updateFeed(context, feed, false);
assertEquals(10, newFeed.getItems().size()); // id 0-duplicate replaces id 0 because the stream url is the same
Feed feedFromDB = DBReader.getFeed(newFeed.getId());
assertEquals(10, feedFromDB.getItems().size()); // id 0-duplicate should override id 0
FeedItem updatedItem = feedFromDB.getItemAtIndex(9);
assertEquals("item 0 duplicate", updatedItem.getTitle());
assertEquals("id 0-duplicate", updatedItem.getItemIdentifier()); // Should use the new ID for sync etc
}
@SuppressWarnings("SameParameterValue")
private void updatedFeedTest(final Feed newFeed, long feedID, List<Long> itemIDs,
int numItemsOld, int numItemsNew) {
@ -285,7 +317,7 @@ public class DbTasksTest {
if (numFeedItems > 0) {
List<FeedItem> items = new ArrayList<>(numFeedItems);
for (int i = 1; i <= numFeedItems; i++) {
FeedItem item = new FeedItem(0, "item " + i + " of " + title, "id", "link",
FeedItem item = new FeedItem(0, "item " + i + " of " + title, "id" + title + i, "link",
new Date(), FeedItem.UNPLAYED, feed);
items.add(item);
}