Dedup based on item unique id, media url or title (#4839)
This commit is contained in:
parent
b4558efe4a
commit
81ea42a2a4
|
@ -334,11 +334,36 @@ public final class DBTasks {
|
|||
}
|
||||
|
||||
/**
|
||||
* Get a FeedItem by its identifying value.
|
||||
* Get a FeedItem by its identifying value or, failing that, by its media stream url or its title and publication date.
|
||||
* For de-duplicating items that are not stored yet, see also FeedHandler.dedupItems
|
||||
*/
|
||||
private static FeedItem searchFeedItemByIdentifyingValue(Feed feed, String identifier) {
|
||||
private static FeedItem searchFeedItemByIdentifyingValue(Feed feed, FeedItem searchItem) {
|
||||
for (FeedItem item : feed.getItems()) {
|
||||
if (TextUtils.equals(item.getIdentifyingValue(), identifier)) {
|
||||
if (TextUtils.equals(item.getIdentifyingValue(), searchItem.getIdentifyingValue())) {
|
||||
return item;
|
||||
}
|
||||
}
|
||||
// Did not find item with same ID. Try to guess duplicates based on other metadata.
|
||||
for (FeedItem item : feed.getItems()) {
|
||||
if (item.getMedia() == null || TextUtils.isEmpty(item.getMedia().getStreamUrl())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
boolean isDuplicate = false;
|
||||
if (TextUtils.equals(item.getMedia().getStreamUrl(), searchItem.getMedia().getStreamUrl())) {
|
||||
Log.d(TAG, "Removing duplicate episode stream url " + item.getMedia().getStreamUrl());
|
||||
isDuplicate = true;
|
||||
} else if (TextUtils.equals(item.getTitle(), searchItem.getTitle())
|
||||
&& item.getPubDate().equals(searchItem.getPubDate())) {
|
||||
Log.d(TAG, "Removing duplicate episode title + pubDate " + item.getTitle() + " " + item.getPubDate());
|
||||
isDuplicate = true;
|
||||
}
|
||||
if (isDuplicate) {
|
||||
DBWriter.addDownloadStatus(new DownloadStatus(feed,
|
||||
searchItem.getTitle(), DownloadError.ERROR_PARSER_EXCEPTION, false,
|
||||
"The podcast host changed the ID of an existing episode instead of just "
|
||||
+ "updating the episode itself. AntennaPod attempted to repair it.", false));
|
||||
item.setItemIdentifier(searchItem.getItemIdentifier());
|
||||
return item;
|
||||
}
|
||||
}
|
||||
|
@ -411,7 +436,7 @@ public final class DBTasks {
|
|||
// Look for new or updated Items
|
||||
for (int idx = 0; idx < newFeed.getItems().size(); idx++) {
|
||||
final FeedItem item = newFeed.getItems().get(idx);
|
||||
FeedItem oldItem = searchFeedItemByIdentifyingValue(savedFeed, item.getIdentifyingValue());
|
||||
FeedItem oldItem = searchFeedItemByIdentifyingValue(savedFeed, item);
|
||||
if (oldItem == null) {
|
||||
// item is new
|
||||
item.setFeed(savedFeed);
|
||||
|
@ -445,7 +470,7 @@ public final class DBTasks {
|
|||
Iterator<FeedItem> it = savedFeed.getItems().iterator();
|
||||
while (it.hasNext()) {
|
||||
FeedItem feedItem = it.next();
|
||||
if (searchFeedItemByIdentifyingValue(newFeed, feedItem.getIdentifyingValue()) == null) {
|
||||
if (searchFeedItemByIdentifyingValue(newFeed, feedItem) == null) {
|
||||
unlistedItems.add(feedItem);
|
||||
it.remove();
|
||||
}
|
||||
|
|
|
@ -1,5 +1,8 @@
|
|||
package de.danoeh.antennapod.core.syndication.handler;
|
||||
|
||||
import android.text.TextUtils;
|
||||
import android.util.Log;
|
||||
|
||||
import org.apache.commons.io.input.XmlStreamReader;
|
||||
import org.xml.sax.InputSource;
|
||||
import org.xml.sax.SAXException;
|
||||
|
@ -7,30 +10,81 @@ import org.xml.sax.SAXException;
|
|||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import javax.xml.parsers.SAXParser;
|
||||
import javax.xml.parsers.SAXParserFactory;
|
||||
|
||||
import de.danoeh.antennapod.model.feed.Feed;
|
||||
import de.danoeh.antennapod.model.feed.FeedItem;
|
||||
|
||||
public class FeedHandler {
|
||||
private static final String TAG = "FeedHandler";
|
||||
|
||||
public FeedHandlerResult parseFeed(Feed feed) throws SAXException, IOException,
|
||||
ParserConfigurationException, UnsupportedFeedtypeException {
|
||||
TypeGetter tg = new TypeGetter();
|
||||
TypeGetter.Type type = tg.getType(feed);
|
||||
SyndHandler handler = new SyndHandler(feed, type);
|
||||
public FeedHandlerResult parseFeed(Feed feed) throws SAXException, IOException,
|
||||
ParserConfigurationException, UnsupportedFeedtypeException {
|
||||
TypeGetter tg = new TypeGetter();
|
||||
TypeGetter.Type type = tg.getType(feed);
|
||||
SyndHandler handler = new SyndHandler(feed, type);
|
||||
|
||||
SAXParserFactory factory = SAXParserFactory.newInstance();
|
||||
factory.setNamespaceAware(true);
|
||||
SAXParser saxParser = factory.newSAXParser();
|
||||
File file = new File(feed.getFile_url());
|
||||
Reader inputStreamReader = new XmlStreamReader(file);
|
||||
InputSource inputSource = new InputSource(inputStreamReader);
|
||||
SAXParserFactory factory = SAXParserFactory.newInstance();
|
||||
factory.setNamespaceAware(true);
|
||||
SAXParser saxParser = factory.newSAXParser();
|
||||
File file = new File(feed.getFile_url());
|
||||
Reader inputStreamReader = new XmlStreamReader(file);
|
||||
InputSource inputSource = new InputSource(inputStreamReader);
|
||||
|
||||
saxParser.parse(inputSource, handler);
|
||||
inputStreamReader.close();
|
||||
return new FeedHandlerResult(handler.state.feed, handler.state.alternateUrls);
|
||||
}
|
||||
saxParser.parse(inputSource, handler);
|
||||
inputStreamReader.close();
|
||||
feed.setItems(dedupItems(feed.getItems()));
|
||||
return new FeedHandlerResult(handler.state.feed, handler.state.alternateUrls);
|
||||
}
|
||||
|
||||
/**
|
||||
* For updating items that are stored in the database, see also: DBTasks.searchFeedItemByIdentifyingValue
|
||||
*/
|
||||
public static List<FeedItem> dedupItems(List<FeedItem> items) {
|
||||
if (items == null) {
|
||||
return null;
|
||||
}
|
||||
List<FeedItem> list = new ArrayList<>(items);
|
||||
Set<String> seen = new HashSet<>();
|
||||
Iterator<FeedItem> it = list.iterator();
|
||||
while (it.hasNext()) {
|
||||
FeedItem item = it.next();
|
||||
if (seen.contains(item.getItemIdentifier())) {
|
||||
Log.d(TAG, "Removing duplicate episode guid " + item.getItemIdentifier());
|
||||
it.remove();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (item.getMedia() == null || TextUtils.isEmpty(item.getMedia().getStreamUrl())) {
|
||||
continue;
|
||||
}
|
||||
if (seen.contains(item.getMedia().getStreamUrl())) {
|
||||
Log.d(TAG, "Removing duplicate episode stream url " + item.getMedia().getStreamUrl());
|
||||
it.remove();
|
||||
} else {
|
||||
seen.add(item.getMedia().getStreamUrl());
|
||||
if (TextUtils.isEmpty(item.getTitle()) || TextUtils.isEmpty(item.getPubDate().toString())) {
|
||||
continue;
|
||||
}
|
||||
if (!seen.contains(item.getTitle() + item.getPubDate().toString())) {
|
||||
seen.add(item.getTitle() + item.getPubDate().toString());
|
||||
} else {
|
||||
Log.d(TAG, "Removing duplicate episode title and pubDate "
|
||||
+ item.getTitle()
|
||||
+ " " + item.getPubDate());
|
||||
it.remove();
|
||||
}
|
||||
}
|
||||
seen.add(item.getItemIdentifier());
|
||||
}
|
||||
return list;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -134,7 +134,7 @@ public class DbCleanupTests {
|
|||
if (itemState == FeedItem.PLAYED) {
|
||||
playbackCompletionDate = itemDate;
|
||||
}
|
||||
FeedItem item = new FeedItem(0, "title", "id", "link", itemDate, itemState, feed);
|
||||
FeedItem item = new FeedItem(0, "title", "id" + i, "link", itemDate, itemState, feed);
|
||||
|
||||
File f = new File(destFolder, "file " + i);
|
||||
assertTrue(f.createNewFile());
|
||||
|
|
|
@ -92,7 +92,7 @@ public class DbNullCleanupAlgorithmTest {
|
|||
feed.setItems(items);
|
||||
List<File> files = new ArrayList<>();
|
||||
for (int i = 0; i < numItems; i++) {
|
||||
FeedItem item = new FeedItem(0, "title", "id", "link", new Date(), FeedItem.PLAYED, feed);
|
||||
FeedItem item = new FeedItem(0, "title", "id" + i, "link", new Date(), FeedItem.PLAYED, feed);
|
||||
|
||||
File f = new File(destFolder, "file " + i);
|
||||
assertTrue(f.createNewFile());
|
||||
|
|
|
@ -197,6 +197,38 @@ public class DbTasksTest {
|
|||
assertEquals(8, feedFromDB.getItems().size()); // 10 - 2 = 8 items
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUpdateFeedSetDuplicate() {
|
||||
final Feed feed = new Feed("url", null, "title");
|
||||
feed.setItems(new ArrayList<>());
|
||||
for (int i = 0; i < 10; i++) {
|
||||
FeedItem item =
|
||||
new FeedItem(0, "item " + i, "id " + i, "link " + i, new Date(i), FeedItem.PLAYED, feed);
|
||||
FeedMedia media = new FeedMedia(item, "download url " + i, 123, "media/mp3");
|
||||
item.setMedia(media);
|
||||
feed.getItems().add(item);
|
||||
}
|
||||
PodDBAdapter adapter = PodDBAdapter.getInstance();
|
||||
adapter.open();
|
||||
adapter.setCompleteFeed(feed);
|
||||
adapter.close();
|
||||
|
||||
// change the guid of the first item, but leave the download url the same
|
||||
FeedItem item = feed.getItemAtIndex(0);
|
||||
item.setItemIdentifier("id 0-duplicate");
|
||||
item.setTitle("item 0 duplicate");
|
||||
Feed newFeed = DBTasks.updateFeed(context, feed, false);
|
||||
assertEquals(10, newFeed.getItems().size()); // id 1-duplicate replaces because the stream url is the same
|
||||
|
||||
Feed feedFromDB = DBReader.getFeed(newFeed.getId());
|
||||
assertEquals(10, feedFromDB.getItems().size()); // id1-duplicate should override id 1
|
||||
|
||||
FeedItem updatedItem = feedFromDB.getItemAtIndex(9);
|
||||
assertEquals("item 0 duplicate", updatedItem.getTitle());
|
||||
assertEquals("id 0-duplicate", updatedItem.getItemIdentifier()); // Should use the new ID for sync etc
|
||||
}
|
||||
|
||||
|
||||
@SuppressWarnings("SameParameterValue")
|
||||
private void updatedFeedTest(final Feed newFeed, long feedID, List<Long> itemIDs,
|
||||
int numItemsOld, int numItemsNew) {
|
||||
|
@ -285,7 +317,7 @@ public class DbTasksTest {
|
|||
if (numFeedItems > 0) {
|
||||
List<FeedItem> items = new ArrayList<>(numFeedItems);
|
||||
for (int i = 1; i <= numFeedItems; i++) {
|
||||
FeedItem item = new FeedItem(0, "item " + i + " of " + title, "id", "link",
|
||||
FeedItem item = new FeedItem(0, "item " + i + " of " + title, "id" + title + i, "link",
|
||||
new Date(), FeedItem.UNPLAYED, feed);
|
||||
items.add(item);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue