From dde499f5b1fd9a7c4c15cd4125cdb877d2d11aed Mon Sep 17 00:00:00 2001 From: ByteHamster Date: Tue, 14 Dec 2021 22:11:36 +0100 Subject: [PATCH] Only mark items as duplicates if duration and date are similar --- .../antennapod/core/storage/DBTasks.java | 31 +------- .../storage/FeedItemDuplicateGuesser.java | 70 +++++++++++++++++++ .../storage/FeedItemDuplicateGuesserTest.java | 66 +++++++++++++++++ 3 files changed, 137 insertions(+), 30 deletions(-) create mode 100644 core/src/main/java/de/danoeh/antennapod/core/storage/FeedItemDuplicateGuesser.java create mode 100644 core/src/test/java/de/danoeh/antennapod/core/storage/FeedItemDuplicateGuesserTest.java diff --git a/core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java b/core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java index 719620202..a0c1e54ad 100644 --- a/core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java +++ b/core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java @@ -352,42 +352,13 @@ public final class DBTasks { */ private static FeedItem searchFeedItemGuessDuplicate(List items, FeedItem searchItem) { for (FeedItem item : items) { - if ((item.getMedia() != null) - && (searchItem.getMedia() != null) - && !TextUtils.isEmpty(item.getMedia().getStreamUrl()) - && !TextUtils.isEmpty(searchItem.getMedia().getStreamUrl()) - && TextUtils.equals(item.getMedia().getStreamUrl(), searchItem.getMedia().getStreamUrl())) { + if (FeedItemDuplicateGuesser.seemDuplicates(item, searchItem)) { return item; - } else if (titlesLookSimilar(item.getTitle(), searchItem.getTitle())) { - if (searchItem.getPubDate() == null || item.getPubDate() == null) { - continue; - } - long dateOriginal = item.getPubDate().getTime(); - long dateNew = searchItem.getPubDate().getTime(); - if (Math.abs(dateOriginal - dateNew) < 7L * 24L * 3600L * 1000L) { // Same week - return item; - } } } return null; } - private static boolean titlesLookSimilar(String title1, String title2) { - if 
(TextUtils.isEmpty(title1) || TextUtils.isEmpty(title2)) { - return false; - } - return canonicalizeTitle(title1).equals(canonicalizeTitle(title2)); - } - - private static String canonicalizeTitle(String title) { - return title - .trim() - .replace('“', '"') - .replace('”', '"') - .replace('„', '"') - .replace('—', '-'); - } - /** * Adds new Feeds to the database or updates the old versions if they already exists. If another Feed with the same * identifying value already exists, this method will add new FeedItems from the new Feed to the existing Feed. diff --git a/core/src/main/java/de/danoeh/antennapod/core/storage/FeedItemDuplicateGuesser.java b/core/src/main/java/de/danoeh/antennapod/core/storage/FeedItemDuplicateGuesser.java new file mode 100644 index 000000000..35d77ae4a --- /dev/null +++ b/core/src/main/java/de/danoeh/antennapod/core/storage/FeedItemDuplicateGuesser.java @@ -0,0 +1,70 @@ +package de.danoeh.antennapod.core.storage; + +import android.text.TextUtils; +import de.danoeh.antennapod.model.feed.FeedItem; +import de.danoeh.antennapod.model.feed.FeedMedia; + +import java.text.DateFormat; +import java.util.Locale; + +/** + * Publishers sometimes mess up their feed by adding episodes twice or by changing the ID of existing episodes. + * This class tries to guess whether the publisher actually meant the same episode, + * even if their feed explicitly says that the episodes are different. 
+ */ +public class FeedItemDuplicateGuesser { + public static boolean seemDuplicates(FeedItem item1, FeedItem item2) { + if (sameAndNotEmpty(item1.getItemIdentifier(), item2.getItemIdentifier())) { + return true; + } + FeedMedia media1 = item1.getMedia(); + FeedMedia media2 = item2.getMedia(); + if (media1 == null || media2 == null) { + return false; + } + if (sameAndNotEmpty(media1.getStreamUrl(), media2.getStreamUrl())) { + return true; + } + return titlesLookSimilar(item1, item2) + && datesLookSimilar(item1, item2) + && durationsLookSimilar(media1, media2) + && TextUtils.equals(media1.getMime_type(), media2.getMime_type()); + } + + private static boolean sameAndNotEmpty(String string1, String string2) { + if (TextUtils.isEmpty(string1) || TextUtils.isEmpty(string2)) { + return false; + } + return string1.equals(string2); + } + + private static boolean datesLookSimilar(FeedItem item1, FeedItem item2) { + if (item1.getPubDate() == null || item2.getPubDate() == null) { + return false; + } + DateFormat dateFormat = DateFormat.getDateInstance(DateFormat.SHORT, Locale.US); // MM/DD/YY + String dateOriginal = dateFormat.format(item2.getPubDate()); + String dateNew = dateFormat.format(item1.getPubDate()); + return TextUtils.equals(dateOriginal, dateNew); // Same date; time is ignored. 
+ } + + private static boolean durationsLookSimilar(FeedMedia media1, FeedMedia media2) { + return Math.abs(media1.getDuration() - media2.getDuration()) < 10 * 60L * 1000L; + } + + private static boolean titlesLookSimilar(FeedItem item1, FeedItem item2) { + return sameAndNotEmpty(canonicalizeTitle(item1.getTitle()), canonicalizeTitle(item2.getTitle())); + } + + private static String canonicalizeTitle(String title) { + if (title == null) { + return ""; + } + return title + .trim() + .replace('“', '"') + .replace('”', '"') + .replace('„', '"') + .replace('—', '-'); + } +} diff --git a/core/src/test/java/de/danoeh/antennapod/core/storage/FeedItemDuplicateGuesserTest.java b/core/src/test/java/de/danoeh/antennapod/core/storage/FeedItemDuplicateGuesserTest.java new file mode 100644 index 000000000..ac7cdee1f --- /dev/null +++ b/core/src/test/java/de/danoeh/antennapod/core/storage/FeedItemDuplicateGuesserTest.java @@ -0,0 +1,66 @@ +package de.danoeh.antennapod.core.storage; + +import de.danoeh.antennapod.model.feed.FeedItem; +import de.danoeh.antennapod.model.feed.FeedMedia; +import org.junit.Test; + +import java.util.Date; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +/** + * Test class for {@link FeedItemDuplicateGuesser}. 
+ */ +public class FeedItemDuplicateGuesserTest { + private static final long MINUTES = 1000 * 60; + private static final long DAYS = 24 * 60 * MINUTES; + + @Test + public void testSameId() { + assertTrue(FeedItemDuplicateGuesser.seemDuplicates( + item("id", "Title1", "example.com/episode1", 0, 5 * MINUTES, "audio/*"), + item("id", "Title2", "example.com/episode2", 0, 20 * MINUTES, "video/*"))); + } + + @Test + public void testDuplicateDownloadUrl() { + assertTrue(FeedItemDuplicateGuesser.seemDuplicates( + item("id1", "Title1", "example.com/episode", 0, 5 * MINUTES, "audio/*"), + item("id2", "Title2", "example.com/episode", 0, 5 * MINUTES, "audio/*"))); + assertFalse(FeedItemDuplicateGuesser.seemDuplicates( + item("id1", "Title1", "example.com/episode1", 0, 5 * MINUTES, "audio/*"), + item("id2", "Title2", "example.com/episode2", 0, 5 * MINUTES, "audio/*"))); + } + + @Test + public void testOtherAttributes() { + assertTrue(FeedItemDuplicateGuesser.seemDuplicates( + item("id1", "Title", "example.com/episode1", 10, 5 * MINUTES, "audio/*"), + item("id2", "Title", "example.com/episode2", 10, 5 * MINUTES, "audio/*"))); + assertTrue(FeedItemDuplicateGuesser.seemDuplicates( + item("id1", "Title", "example.com/episode1", 10, 5 * MINUTES, "audio/*"), + item("id2", "Title", "example.com/episode2", 20, 6 * MINUTES, "audio/*"))); + assertFalse(FeedItemDuplicateGuesser.seemDuplicates( + item("id1", "Title", "example.com/episode1", 10, 5 * MINUTES, "audio/*"), + item("id2", "Title", "example.com/episode2", 10, 5 * MINUTES, "video/*"))); + assertFalse(FeedItemDuplicateGuesser.seemDuplicates( + item("id1", "Title", "example.com/episode1", 5 * DAYS, 5 * MINUTES, "audio/*"), + item("id2", "Title", "example.com/episode2", 2 * DAYS, 5 * MINUTES, "audio/*"))); + } + + @Test + public void testNoMediaType() { + assertTrue(FeedItemDuplicateGuesser.seemDuplicates( + item("id1", "Title", "example.com/episode1", 2 * DAYS, 5 * MINUTES, ""), + item("id2", "Title", "example.com/episode2", 2 * 
DAYS, 5 * MINUTES, ""))); + } + + private FeedItem item(String guid, String title, String downloadUrl, + long date, long duration, String mime) { + FeedItem item = new FeedItem(0, title, guid, "link", new Date(date), FeedItem.PLAYED, null); + FeedMedia media = new FeedMedia(item, downloadUrl, duration, mime); + item.setMedia(media); + return item; + } +} \ No newline at end of file